# howard.objects.variants
1import csv 2import gc 3import gzip 4import io 5import multiprocessing 6import os 7import random 8import re 9import shlex 10import sqlite3 11import subprocess 12from tempfile import NamedTemporaryFile, TemporaryDirectory 13import tempfile 14import duckdb 15import json 16import yaml 17import argparse 18import Bio.bgzf as bgzf 19import pandas as pd 20from pyfaidx import Fasta 21import numpy as np 22import vcf 23import logging as log 24import fastparquet as fp 25from multiprocesspandas import applyparallel 26 27from howard.functions.commons import * 28from howard.objects.database import * 29from howard.functions.databases import * 30from howard.functions.utils import * 31 32 33class Variants: 34 35 def __init__( 36 self, 37 conn=None, 38 input: str = None, 39 output: str = None, 40 config: dict = {}, 41 param: dict = {}, 42 load: bool = False, 43 ) -> None: 44 """ 45 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 46 header 47 48 :param conn: the connection to the database 49 :param input: the input file 50 :param output: the output file 51 :param config: a dictionary containing the configuration of the model 52 :param param: a dictionary containing the parameters of the model 53 """ 54 55 # Init variables 56 self.init_variables() 57 58 # Input 59 self.set_input(input) 60 61 # Config 62 self.set_config(config) 63 64 # Param 65 self.set_param(param) 66 67 # Output 68 self.set_output(output) 69 70 # connexion 71 self.set_connexion(conn) 72 73 # Header 74 self.set_header() 75 76 # Samples 77 self.set_samples() 78 79 # Load data 80 if load: 81 self.load_data() 82 83 def set_samples(self, samples: list = None) -> list: 84 """ 85 The function `set_samples` sets the samples attribute of an object to a provided list or 86 retrieves it from a parameter dictionary. 
87 88 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 89 input and sets the `samples` attribute of the class to the provided list. If no samples are 90 provided, it tries to get the samples from the class's parameters using the `get_param` method 91 :type samples: list 92 :return: The `samples` list is being returned. 93 """ 94 95 if not samples: 96 samples = self.get_param().get("samples", {}).get("list", None) 97 98 self.samples = samples 99 100 return samples 101 102 def get_samples(self) -> list: 103 """ 104 This function returns a list of samples. 105 :return: The `get_samples` method is returning the `samples` attribute of the object. 106 """ 107 108 return self.samples 109 110 def get_samples_check(self) -> bool: 111 """ 112 This function returns the value of the "check" key within the "samples" dictionary retrieved 113 from the parameters. 114 :return: The method `get_samples_check` is returning the value of the key "check" inside the 115 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 116 method. If the key "check" is not found, it will return `False`. 117 """ 118 119 return self.get_param().get("samples", {}).get("check", True) 120 121 def set_input(self, input: str = None) -> None: 122 """ 123 The function `set_input` takes a file name as input, extracts the name and extension, and sets 124 attributes in the class accordingly. 125 126 :param input: The `set_input` method in the provided code snippet is used to set attributes 127 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 128 :type input: str 129 """ 130 131 if input and not isinstance(input, str): 132 try: 133 self.input = input.name 134 except: 135 log.error(f"Input file '{input} in bad format") 136 raise ValueError(f"Input file '{input} in bad format") 137 else: 138 self.input = input 139 140 # Input format 141 if input: 142 input_name, input_extension = os.path.splitext(self.input) 143 self.input_name = input_name 144 self.input_extension = input_extension 145 self.input_format = self.input_extension.replace(".", "") 146 147 def set_config(self, config: dict) -> None: 148 """ 149 The set_config function takes a config object and assigns it as the configuration object for the 150 class. 151 152 :param config: The `config` parameter in the `set_config` function is a dictionary object that 153 contains configuration settings for the class. When you call the `set_config` function with a 154 dictionary object as the argument, it will set that dictionary as the configuration object for 155 the class 156 :type config: dict 157 """ 158 159 self.config = config 160 161 def set_param(self, param: dict) -> None: 162 """ 163 This function sets a parameter object for the class based on the input dictionary. 
164 165 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 166 as the `param` attribute of the class instance 167 :type param: dict 168 """ 169 170 self.param = param 171 172 def init_variables(self) -> None: 173 """ 174 This function initializes the variables that will be used in the rest of the class 175 """ 176 177 self.prefix = "howard" 178 self.table_variants = "variants" 179 self.dataframe = None 180 181 self.comparison_map = { 182 "gt": ">", 183 "gte": ">=", 184 "lt": "<", 185 "lte": "<=", 186 "equals": "=", 187 "contains": "SIMILAR TO", 188 } 189 190 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 191 192 self.code_type_map_to_sql = { 193 "Integer": "INTEGER", 194 "String": "VARCHAR", 195 "Float": "FLOAT", 196 "Flag": "VARCHAR", 197 } 198 199 self.index_additionnal_fields = [] 200 201 def get_indexing(self) -> bool: 202 """ 203 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 204 returns False. 205 :return: The value of the indexing parameter. 206 """ 207 208 return self.get_param().get("indexing", False) 209 210 def get_connexion_config(self) -> dict: 211 """ 212 The function `get_connexion_config` returns a dictionary containing the configuration for a 213 connection, including the number of threads and memory limit. 214 :return: a dictionary containing the configuration for the Connexion library. 
215 """ 216 217 # config 218 config = self.get_config() 219 220 # Connexion config 221 connexion_config = {} 222 threads = self.get_threads() 223 224 # Threads 225 if threads: 226 connexion_config["threads"] = threads 227 228 # Memory 229 # if config.get("memory", None): 230 # connexion_config["memory_limit"] = config.get("memory") 231 if self.get_memory(): 232 connexion_config["memory_limit"] = self.get_memory() 233 234 # Temporary directory 235 if config.get("tmp", None): 236 connexion_config["temp_directory"] = config.get("tmp") 237 238 # Access 239 if config.get("access", None): 240 access = config.get("access") 241 if access in ["RO"]: 242 access = "READ_ONLY" 243 elif access in ["RW"]: 244 access = "READ_WRITE" 245 connexion_db = self.get_connexion_db() 246 if connexion_db in ":memory:": 247 access = "READ_WRITE" 248 connexion_config["access_mode"] = access 249 250 return connexion_config 251 252 def get_duckdb_settings(self) -> dict: 253 """ 254 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 255 string. 256 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 257 """ 258 259 # config 260 config = self.get_config() 261 262 # duckdb settings 263 duckdb_settings_dict = {} 264 if config.get("duckdb_settings", None): 265 duckdb_settings = config.get("duckdb_settings") 266 duckdb_settings = full_path(duckdb_settings) 267 # duckdb setting is a file 268 if os.path.exists(duckdb_settings): 269 with open(duckdb_settings) as json_file: 270 duckdb_settings_dict = yaml.safe_load(json_file) 271 # duckdb settings is a string 272 else: 273 duckdb_settings_dict = json.loads(duckdb_settings) 274 275 return duckdb_settings_dict 276 277 def set_connexion_db(self) -> str: 278 """ 279 The function `set_connexion_db` returns the appropriate database connection string based on the 280 input format and connection type. 281 :return: the value of the variable `connexion_db`. 
282 """ 283 284 # Default connexion db 285 default_connexion_db = ":memory:" 286 287 # Find connexion db 288 if self.get_input_format() in ["db", "duckdb"]: 289 connexion_db = self.get_input() 290 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 291 connexion_db = default_connexion_db 292 elif self.get_connexion_type() in ["tmpfile"]: 293 tmp_name = tempfile.mkdtemp( 294 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 295 ) 296 connexion_db = f"{tmp_name}/tmp.db" 297 elif self.get_connexion_type() != "": 298 connexion_db = self.get_connexion_type() 299 else: 300 connexion_db = default_connexion_db 301 302 # Set connexion db 303 self.connexion_db = connexion_db 304 305 return connexion_db 306 307 def set_connexion(self, conn) -> None: 308 """ 309 The function `set_connexion` creates a connection to a database, with options for different 310 database formats and settings. 311 312 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 313 database. If a connection is not provided, a new connection to an in-memory database is created. 
314 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 315 sqlite 316 """ 317 318 # Connexion db 319 connexion_db = self.set_connexion_db() 320 321 # Connexion config 322 connexion_config = self.get_connexion_config() 323 324 # Connexion format 325 connexion_format = self.get_config().get("connexion_format", "duckdb") 326 # Set connexion format 327 self.connexion_format = connexion_format 328 329 # Connexion 330 if not conn: 331 if connexion_format in ["duckdb"]: 332 conn = duckdb.connect(connexion_db, config=connexion_config) 333 # duckDB settings 334 duckdb_settings = self.get_duckdb_settings() 335 if duckdb_settings: 336 for setting in duckdb_settings: 337 setting_value = duckdb_settings.get(setting) 338 if isinstance(setting_value, str): 339 setting_value = f"'{setting_value}'" 340 conn.execute(f"PRAGMA {setting}={setting_value};") 341 elif connexion_format in ["sqlite"]: 342 conn = sqlite3.connect(connexion_db) 343 344 # Set connexion 345 self.conn = conn 346 347 # Log 348 log.debug(f"connexion_format: {connexion_format}") 349 log.debug(f"connexion_db: {connexion_db}") 350 log.debug(f"connexion config: {connexion_config}") 351 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 352 353 def set_output(self, output: str = None) -> None: 354 """ 355 The `set_output` function in Python sets the output file based on the input or a specified key 356 in the config file, extracting the output name, extension, and format. 357 358 :param output: The `output` parameter in the `set_output` method is used to specify the name of 359 the output file. If the config file has an 'output' key, the method sets the output to the value 360 of that key. 
If no output is provided, it sets the output to `None` 361 :type output: str 362 """ 363 364 if output and not isinstance(output, str): 365 self.output = output.name 366 else: 367 self.output = output 368 369 # Output format 370 if self.output: 371 output_name, output_extension = os.path.splitext(self.output) 372 self.output_name = output_name 373 self.output_extension = output_extension 374 self.output_format = self.output_extension.replace(".", "") 375 else: 376 self.output_name = None 377 self.output_extension = None 378 self.output_format = None 379 380 def set_header(self) -> None: 381 """ 382 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 383 """ 384 385 input_file = self.get_input() 386 default_header_list = [ 387 "##fileformat=VCFv4.2", 388 "#CHROM POS ID REF ALT QUAL FILTER INFO", 389 ] 390 391 # Full path 392 input_file = full_path(input_file) 393 394 if input_file: 395 396 input_format = self.get_input_format() 397 input_compressed = self.get_input_compressed() 398 config = self.get_config() 399 header_list = default_header_list 400 if input_format in [ 401 "vcf", 402 "hdr", 403 "tsv", 404 "csv", 405 "psv", 406 "parquet", 407 "db", 408 "duckdb", 409 ]: 410 # header provided in param 411 if config.get("header_file", None): 412 with open(config.get("header_file"), "rt") as f: 413 header_list = self.read_vcf_header(f) 414 # within a vcf file format (header within input file itsself) 415 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 416 # within a compressed vcf file format (.vcf.gz) 417 if input_compressed: 418 with bgzf.open(input_file, "rt") as f: 419 header_list = self.read_vcf_header(f) 420 # within an uncompressed vcf file format (.vcf) 421 else: 422 with open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # header provided in default external file .hdr 425 elif os.path.exists((input_file + ".hdr")): 426 with open(input_file + ".hdr", "rt") as f: 427 header_list = 
self.read_vcf_header(f) 428 else: 429 try: # Try to get header info fields and file columns 430 431 with tempfile.TemporaryDirectory() as tmpdir: 432 433 # Create database 434 db_for_header = Database(database=input_file) 435 436 # Get header columns for infos fields 437 db_header_from_columns = ( 438 db_for_header.get_header_from_columns() 439 ) 440 441 # Get real columns in the file 442 db_header_columns = db_for_header.get_columns() 443 444 # Write header file 445 header_file_tmp = os.path.join(tmpdir, "header") 446 f = open(header_file_tmp, "w") 447 vcf.Writer(f, db_header_from_columns) 448 f.close() 449 450 # Replace #CHROM line with rel columns 451 header_list = db_for_header.read_header_file( 452 header_file=header_file_tmp 453 ) 454 header_list[-1] = "\t".join(db_header_columns) 455 456 except: 457 458 log.warning( 459 f"No header for file {input_file}. Set as default VCF header" 460 ) 461 header_list = default_header_list 462 463 else: # try for unknown format ? 464 465 log.error(f"Input file format '{input_format}' not available") 466 raise ValueError(f"Input file format '{input_format}' not available") 467 468 if not header_list: 469 header_list = default_header_list 470 471 # header as list 472 self.header_list = header_list 473 474 # header as VCF object 475 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 476 477 else: 478 479 self.header_list = None 480 self.header_vcf = None 481 482 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 483 """ 484 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 485 DataFrame based on the connection format. 486 487 :param query: The `query` parameter in the `get_query_to_df` function is a string that 488 represents the SQL query you want to execute. 
This query will be used to fetch data from a 489 database and convert it into a pandas DataFrame 490 :type query: str 491 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 492 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 493 function will only fetch up to that number of rows from the database query result. If no limit 494 is specified, 495 :type limit: int 496 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 497 """ 498 499 # Connexion format 500 connexion_format = self.get_connexion_format() 501 502 # Limit in query 503 if limit: 504 pd.set_option("display.max_rows", limit) 505 if connexion_format in ["duckdb"]: 506 df = ( 507 self.conn.execute(query) 508 .fetch_record_batch(limit) 509 .read_next_batch() 510 .to_pandas() 511 ) 512 elif connexion_format in ["sqlite"]: 513 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 514 515 # Full query 516 else: 517 if connexion_format in ["duckdb"]: 518 df = self.conn.execute(query).df() 519 elif connexion_format in ["sqlite"]: 520 df = pd.read_sql_query(query, self.conn) 521 522 return df 523 524 def get_overview(self) -> None: 525 """ 526 The function prints the input, output, config, and dataframe of the current object 527 """ 528 table_variants_from = self.get_table_variants(clause="from") 529 sql_columns = self.get_header_columns_as_sql() 530 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 531 df = self.get_query_to_df(sql_query_export) 532 log.info( 533 "Input: " 534 + str(self.get_input()) 535 + " [" 536 + str(str(self.get_input_format())) 537 + "]" 538 ) 539 log.info( 540 "Output: " 541 + str(self.get_output()) 542 + " [" 543 + str(str(self.get_output_format())) 544 + "]" 545 ) 546 log.info("Config: ") 547 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 548 "\n" 549 ): 550 log.info("\t" + str(d)) 551 log.info("Param: ") 552 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 553 "\n" 554 ): 555 log.info("\t" + str(d)) 556 log.info("Sample list: " + str(self.get_header_sample_list())) 557 log.info("Dataframe: ") 558 for d in str(df).split("\n"): 559 log.info("\t" + str(d)) 560 561 # garbage collector 562 del df 563 gc.collect() 564 565 return None 566 567 def get_stats(self) -> dict: 568 """ 569 The `get_stats` function calculates and returns various statistics of the current object, 570 including information about the input file, variants, samples, header fields, quality, and 571 SNVs/InDels. 572 :return: a dictionary containing various statistics of the current object. The dictionary has 573 the following structure: 574 """ 575 576 # Log 577 log.info(f"Stats Calculation...") 578 579 # table varaints 580 table_variants_from = self.get_table_variants() 581 582 # stats dict 583 stats = {"Infos": {}} 584 585 ### File 586 input_file = self.get_input() 587 stats["Infos"]["Input file"] = input_file 588 589 # Header 590 header_infos = self.get_header().infos 591 header_formats = self.get_header().formats 592 header_infos_list = list(header_infos) 593 header_formats_list = list(header_formats) 594 595 ### Variants 596 597 stats["Variants"] = {} 598 599 # Variants by chr 600 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 601 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 602 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 603 by=["CHROM"], kind="quicksort" 604 ) 605 606 # Total number of variants 607 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 608 609 # Calculate percentage 610 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 611 lambda x: (x / nb_of_variants) 612 ) 613 614 stats["Variants"]["Number of variants by chromosome"] = ( 615 nb_of_variants_by_chrom.to_dict(orient="index") 616 ) 617 618 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 619 620 ### Samples 621 622 # Init 623 samples = {} 624 nb_of_samples = 0 625 626 # Check Samples 627 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 628 log.debug(f"Check samples...") 629 for sample in self.get_header_sample_list(): 630 sql_query_samples = f""" 631 SELECT '{sample}' as sample, 632 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 633 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 634 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 635 FROM {table_variants_from} 636 WHERE ( 637 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 638 AND 639 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 640 ) 641 GROUP BY genotype 642 """ 643 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 644 sample_genotype_count = sql_query_genotype_df["count"].sum() 645 if len(sql_query_genotype_df): 646 nb_of_samples += 1 647 samples[f"{sample} - {sample_genotype_count} variants"] = ( 648 sql_query_genotype_df.to_dict(orient="index") 649 ) 650 651 stats["Samples"] = samples 652 stats["Infos"]["Number of samples"] = nb_of_samples 653 654 # # 655 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 656 # stats["Infos"]["Number of samples"] = nb_of_samples 657 # elif nb_of_samples: 658 # stats["Infos"]["Number of samples"] = "not a VCF format" 659 660 ### INFO and FORMAT fields 661 header_types_df = {} 662 header_types_list = { 663 "List of INFO fields": header_infos, 664 "List of FORMAT fields": header_formats, 665 } 666 i = 0 667 for header_type in header_types_list: 668 669 header_type_infos = header_types_list.get(header_type) 670 header_infos_dict = {} 671 672 for info in header_type_infos: 673 674 i += 1 675 header_infos_dict[i] = {} 676 677 # ID 678 header_infos_dict[i]["id"] = info 679 680 # num 681 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 682 if header_type_infos[info].num in genotype_map.keys(): 683 header_infos_dict[i]["Number"] = genotype_map.get( 684 header_type_infos[info].num 685 ) 686 else: 687 header_infos_dict[i]["Number"] = header_type_infos[info].num 688 689 # type 690 if header_type_infos[info].type: 691 header_infos_dict[i]["Type"] = header_type_infos[info].type 692 else: 693 header_infos_dict[i]["Type"] = "." 694 695 # desc 696 if header_type_infos[info].desc != None: 697 header_infos_dict[i]["Description"] = header_type_infos[info].desc 698 else: 699 header_infos_dict[i]["Description"] = "" 700 701 if len(header_infos_dict): 702 header_types_df[header_type] = pd.DataFrame.from_dict( 703 header_infos_dict, orient="index" 704 ).to_dict(orient="index") 705 706 # Stats 707 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 708 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 709 stats["Header"] = header_types_df 710 711 ### QUAL 712 if "QUAL" in self.get_header_columns(): 713 sql_query_qual = f""" 714 SELECT 715 avg(CAST(QUAL AS INTEGER)) AS Average, 716 min(CAST(QUAL AS INTEGER)) AS Minimum, 717 max(CAST(QUAL AS INTEGER)) AS Maximum, 718 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 719 median(CAST(QUAL AS INTEGER)) AS Median, 720 variance(CAST(QUAL AS INTEGER)) AS Variance 721 FROM {table_variants_from} 722 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 723 """ 724 725 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 726 stats["Quality"] = {"Stats": qual} 727 728 ### SNV and InDel 729 730 sql_query_snv = f""" 731 732 SELECT Type, count FROM ( 733 734 SELECT 735 'Total' AS Type, 736 count(*) AS count 737 FROM {table_variants_from} 738 739 UNION 740 741 SELECT 742 'MNV' AS Type, 743 count(*) AS count 744 FROM {table_variants_from} 745 WHERE len(REF) > 1 AND len(ALT) > 1 746 AND len(REF) = len(ALT) 747 748 UNION 749 750 SELECT 751 'InDel' AS Type, 752 count(*) AS count 753 FROM 
{table_variants_from} 754 WHERE len(REF) > 1 OR len(ALT) > 1 755 AND len(REF) != len(ALT) 756 757 UNION 758 759 SELECT 760 'SNV' AS Type, 761 count(*) AS count 762 FROM {table_variants_from} 763 WHERE len(REF) = 1 AND len(ALT) = 1 764 765 ) 766 767 ORDER BY count DESC 768 769 """ 770 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 771 772 sql_query_snv_substitution = f""" 773 SELECT 774 concat(REF, '>', ALT) AS 'Substitution', 775 count(*) AS count 776 FROM {table_variants_from} 777 WHERE len(REF) = 1 AND len(ALT) = 1 778 GROUP BY REF, ALT 779 ORDER BY count(*) DESC 780 """ 781 snv_substitution = ( 782 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 783 ) 784 stats["Variants"]["Counts"] = snv_indel 785 stats["Variants"]["Substitutions"] = snv_substitution 786 787 return stats 788 789 def stats_to_file(self, file: str = None) -> str: 790 """ 791 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 792 into a JSON object, and writes the JSON object to the specified file. 793 794 :param file: The `file` parameter is a string that represents the file path where the JSON data 795 will be written 796 :type file: str 797 :return: the name of the file that was written to. 798 """ 799 800 # Get stats 801 stats = self.get_stats() 802 803 # Serializing json 804 json_object = json.dumps(stats, indent=4) 805 806 # Writing to sample.json 807 with open(file, "w") as outfile: 808 outfile.write(json_object) 809 810 return file 811 812 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 813 """ 814 The `print_stats` function generates a markdown file and prints the statistics contained in a 815 JSON file in a formatted manner. 816 817 :param output_file: The `output_file` parameter is a string that specifies the path and filename 818 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 819 provided, a temporary directory will be created and the stats will be saved in a file named 820 "stats.md" within that 821 :type output_file: str 822 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 823 file where the statistics will be saved. If no value is provided, a temporary directory will be 824 created and a default file name "stats.json" will be used 825 :type json_file: str 826 :return: The function `print_stats` does not return any value. It has a return type annotation 827 of `None`. 828 """ 829 830 # Full path 831 output_file = full_path(output_file) 832 json_file = full_path(json_file) 833 834 with tempfile.TemporaryDirectory() as tmpdir: 835 836 # Files 837 if not output_file: 838 output_file = os.path.join(tmpdir, "stats.md") 839 if not json_file: 840 json_file = os.path.join(tmpdir, "stats.json") 841 842 # Create folders 843 if not os.path.exists(os.path.dirname(output_file)): 844 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 845 if not os.path.exists(os.path.dirname(json_file)): 846 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 847 848 # Create stats JSON file 849 stats_file = self.stats_to_file(file=json_file) 850 851 # Print stats file 852 with open(stats_file) as f: 853 stats = yaml.safe_load(f) 854 855 # Output 856 output_title = [] 857 output_index = [] 858 output = [] 859 860 # Title 861 output_title.append("# HOWARD Stats") 862 863 # Index 864 output_index.append("## Index") 865 866 # Process sections 867 for section in stats: 868 infos = stats.get(section) 869 section_link = "#" + section.lower().replace(" ", "-") 870 output.append(f"## {section}") 871 output_index.append(f"- [{section}]({section_link})") 872 873 if len(infos): 874 for info in infos: 875 try: 876 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 877 is_df = True 878 except: 879 try: 880 df = pd.DataFrame.from_dict( 881 
json.loads((infos.get(info))), orient="index" 882 ) 883 is_df = True 884 except: 885 is_df = False 886 if is_df: 887 output.append(f"### {info}") 888 info_link = "#" + info.lower().replace(" ", "-") 889 output_index.append(f" - [{info}]({info_link})") 890 output.append(f"{df.to_markdown(index=False)}") 891 else: 892 output.append(f"- {info}: {infos.get(info)}") 893 else: 894 output.append(f"NA") 895 896 # Write stats in markdown file 897 with open(output_file, "w") as fp: 898 for item in output_title: 899 fp.write("%s\n" % item) 900 for item in output_index: 901 fp.write("%s\n" % item) 902 for item in output: 903 fp.write("%s\n" % item) 904 905 # Output stats in markdown 906 print("") 907 print("\n\n".join(output_title)) 908 print("") 909 print("\n\n".join(output)) 910 print("") 911 912 return None 913 914 def get_input(self) -> str: 915 """ 916 It returns the value of the input variable. 917 :return: The input is being returned. 918 """ 919 return self.input 920 921 def get_input_format(self, input_file: str = None) -> str: 922 """ 923 This function returns the format of the input variable, either from the provided input file or 924 by prompting for input. 925 926 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 927 represents the file path of the input file. If no `input_file` is provided when calling the 928 method, it will default to `None` 929 :type input_file: str 930 :return: The format of the input variable is being returned. 931 """ 932 933 if not input_file: 934 input_file = self.get_input() 935 input_format = get_file_format(input_file) 936 return input_format 937 938 def get_input_compressed(self, input_file: str = None) -> str: 939 """ 940 The function `get_input_compressed` returns the format of the input variable after compressing 941 it. 942 943 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 944 that represents the file path of the input file. 
If no `input_file` is provided when calling the 945 method, it will default to `None` and the method will then call `self.get_input()` to 946 :type input_file: str 947 :return: The function `get_input_compressed` returns the compressed format of the input 948 variable. 949 """ 950 951 if not input_file: 952 input_file = self.get_input() 953 input_compressed = get_file_compressed(input_file) 954 return input_compressed 955 956 def get_output(self) -> str: 957 """ 958 It returns the output of the neuron. 959 :return: The output of the neural network. 960 """ 961 962 return self.output 963 964 def get_output_format(self, output_file: str = None) -> str: 965 """ 966 The function `get_output_format` returns the format of the input variable or the output file if 967 provided. 968 969 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 970 that represents the file path of the output file. If no `output_file` is provided when calling 971 the method, it will default to the output obtained from the `get_output` method of the class 972 instance. The 973 :type output_file: str 974 :return: The format of the input variable is being returned. 975 """ 976 977 if not output_file: 978 output_file = self.get_output() 979 output_format = get_file_format(output_file) 980 981 return output_format 982 983 def get_config(self) -> dict: 984 """ 985 It returns the config 986 :return: The config variable is being returned. 987 """ 988 return self.config 989 990 def get_param(self) -> dict: 991 """ 992 It returns the param 993 :return: The param variable is being returned. 994 """ 995 return self.param 996 997 def get_connexion_db(self) -> str: 998 """ 999 It returns the connexion_db attribute of the object 1000 :return: The connexion_db is being returned. 1001 """ 1002 return self.connexion_db 1003 1004 def get_prefix(self) -> str: 1005 """ 1006 It returns the prefix of the object. 1007 :return: The prefix is being returned. 
1008 """ 1009 return self.prefix 1010 1011 def get_table_variants(self, clause: str = "select") -> str: 1012 """ 1013 This function returns the table_variants attribute of the object 1014 1015 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1016 defaults to select (optional) 1017 :return: The table_variants attribute of the object. 1018 """ 1019 1020 # Access 1021 access = self.get_config().get("access", None) 1022 1023 # Clauses "select", "where", "update" 1024 if clause in ["select", "where", "update"]: 1025 table_variants = self.table_variants 1026 # Clause "from" 1027 elif clause in ["from"]: 1028 # For Read Only 1029 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1030 input_file = self.get_input() 1031 table_variants = f"'{input_file}' as variants" 1032 # For Read Write 1033 else: 1034 table_variants = f"{self.table_variants} as variants" 1035 else: 1036 table_variants = self.table_variants 1037 return table_variants 1038 1039 def get_tmp_dir(self) -> str: 1040 """ 1041 The function `get_tmp_dir` returns the temporary directory path based on configuration 1042 parameters or a default path. 1043 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1044 configuration, parameters, and a default value of "/tmp". 1045 """ 1046 1047 return get_tmp( 1048 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1049 ) 1050 1051 def get_connexion_type(self) -> str: 1052 """ 1053 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1054 1055 :return: The connexion type is being returned. 1056 """ 1057 return self.get_config().get("connexion_type", "memory") 1058 1059 def get_connexion(self): 1060 """ 1061 It returns the connection object 1062 1063 :return: The connection object. 1064 """ 1065 return self.conn 1066 1067 def close_connexion(self) -> None: 1068 """ 1069 This function closes the connection to the database. 
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file.

        :param type: the type of header you want to get, either "vcf" (a vcf.Reader object) or
        "list" (raw header lines), defaults to vcf (optional)
        :return: The header of the vcf file, or a minimal required VCF header when none is loaded.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line; 0 when no header is
        available.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the columns line of the VCF header

        :return: The last line of the header list (the "#CHROM ..." columns line), or "" if no
        header is available.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the header columns of a VCF as a list

        :return: The column names of the "#CHROM ..." header line, as a list.
1129 """ 1130 if self.get_header(): 1131 return self.get_header_columns().strip().split("\t") 1132 else: 1133 return [] 1134 1135 def get_header_columns_as_sql(self) -> str: 1136 """ 1137 This function retruns header length (without #CHROM line) 1138 1139 :return: The length of the header list. 1140 """ 1141 sql_column_list = [] 1142 for col in self.get_header_columns_as_list(): 1143 sql_column_list.append(f'"{col}"') 1144 return ",".join(sql_column_list) 1145 1146 def get_header_sample_list( 1147 self, check: bool = False, samples: list = None, samples_force: bool = False 1148 ) -> list: 1149 """ 1150 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1151 checking and filtering based on input parameters. 1152 1153 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1154 parameter that determines whether to check if the samples in the list are properly defined as 1155 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1156 list is defined as a, defaults to False 1157 :type check: bool (optional) 1158 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1159 allows you to specify a subset of samples from the header. If you provide a list of sample 1160 names, the function will check if each sample is defined in the header. If a sample is not found 1161 in the 1162 :type samples: list 1163 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1164 a boolean parameter that determines whether to force the function to return the sample list 1165 without checking if the samples are genotype columns. 
If `samples_force` is set to `True`, the 1166 function will return the sample list without performing, defaults to False 1167 :type samples_force: bool (optional) 1168 :return: The function `get_header_sample_list` returns a list of samples based on the input 1169 parameters and conditions specified in the function. 1170 """ 1171 1172 # Init 1173 samples_list = [] 1174 1175 if samples is None: 1176 samples_list = self.header_vcf.samples 1177 else: 1178 samples_checked = [] 1179 for sample in samples: 1180 if sample in self.header_vcf.samples: 1181 samples_checked.append(sample) 1182 else: 1183 log.warning(f"Sample '{sample}' not defined in header") 1184 samples_list = samples_checked 1185 1186 # Force sample list without checking if is_genotype_column 1187 if samples_force: 1188 log.warning(f"Samples {samples_list} not checked if genotypes") 1189 return samples_list 1190 1191 if check: 1192 samples_checked = [] 1193 for sample in samples_list: 1194 if self.is_genotype_column(column=sample): 1195 samples_checked.append(sample) 1196 else: 1197 log.warning( 1198 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1199 ) 1200 samples_list = samples_checked 1201 1202 # Return samples list 1203 return samples_list 1204 1205 def is_genotype_column(self, column: str = None) -> bool: 1206 """ 1207 This function checks if a given column is a genotype column in a database. 1208 1209 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1210 represents the column name in a database table. This method checks if the specified column is a 1211 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1212 method of 1213 :type column: str 1214 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1215 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1216 column name and returns the result. 
If the `column` parameter is None, it returns False. 1217 """ 1218 1219 if column is not None: 1220 return Database(database=self.get_input()).is_genotype_column(column=column) 1221 else: 1222 return False 1223 1224 def get_verbose(self) -> bool: 1225 """ 1226 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1227 exist 1228 1229 :return: The value of the key "verbose" in the config dictionary. 1230 """ 1231 return self.get_config().get("verbose", False) 1232 1233 def get_connexion_format(self) -> str: 1234 """ 1235 It returns the connexion format of the object. 1236 :return: The connexion_format is being returned. 1237 """ 1238 connexion_format = self.connexion_format 1239 if connexion_format not in ["duckdb", "sqlite"]: 1240 log.error(f"Unknown connexion format {connexion_format}") 1241 raise ValueError(f"Unknown connexion format {connexion_format}") 1242 else: 1243 return connexion_format 1244 1245 def insert_file_to_table( 1246 self, 1247 file, 1248 columns: str, 1249 header_len: int = 0, 1250 sep: str = "\t", 1251 chunksize: int = 1000000, 1252 ) -> None: 1253 """ 1254 The function reads a file in chunks and inserts each chunk into a table based on the specified 1255 database format. 1256 1257 :param file: The `file` parameter is the file that you want to load into a table. It should be 1258 the path to the file on your system 1259 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1260 should contain the names of the columns in the table where the data will be inserted. The column 1261 names should be separated by commas within the string. For example, if you have columns named 1262 "id", "name 1263 :type columns: str 1264 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1265 the number of lines to skip at the beginning of the file before reading the actual data. 
This 1266 parameter allows you to skip any header information present in the file before processing the 1267 data, defaults to 0 1268 :type header_len: int (optional) 1269 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1270 separator character that is used in the file being read. In this case, the default separator is 1271 set to `\t`, which represents a tab character. You can change this parameter to a different 1272 separator character if, defaults to \t 1273 :type sep: str (optional) 1274 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1275 when processing the file in chunks. In the provided code snippet, the default value for 1276 `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults 1277 to 1000000 1278 :type chunksize: int (optional) 1279 """ 1280 1281 # Config 1282 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1283 connexion_format = self.get_connexion_format() 1284 1285 log.debug("chunksize: " + str(chunksize)) 1286 1287 if chunksize: 1288 for chunk in pd.read_csv( 1289 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1290 ): 1291 if connexion_format in ["duckdb"]: 1292 sql_insert_into = ( 1293 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1294 ) 1295 self.conn.execute(sql_insert_into) 1296 elif connexion_format in ["sqlite"]: 1297 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1298 1299 def load_data( 1300 self, 1301 input_file: str = None, 1302 drop_variants_table: bool = False, 1303 sample_size: int = 20480, 1304 ) -> None: 1305 """ 1306 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1307 table before loading the data and specify a sample size. 1308 1309 :param input_file: The path to the input file. 
This is the VCF file that will be loaded into the 1310 table 1311 :type input_file: str 1312 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1313 determines whether the variants table should be dropped before loading the data. If set to 1314 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1315 not be dropped, defaults to False 1316 :type drop_variants_table: bool (optional) 1317 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1318 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1319 20480 1320 :type sample_size: int (optional) 1321 """ 1322 1323 log.info("Loading...") 1324 1325 # change input file 1326 if input_file: 1327 self.set_input(input_file) 1328 self.set_header() 1329 1330 # drop variants table 1331 if drop_variants_table: 1332 self.drop_variants_table() 1333 1334 # get table variants 1335 table_variants = self.get_table_variants() 1336 1337 # Access 1338 access = self.get_config().get("access", None) 1339 log.debug(f"access: {access}") 1340 1341 # Input format and compress 1342 input_format = self.get_input_format() 1343 input_compressed = self.get_input_compressed() 1344 log.debug(f"input_format: {input_format}") 1345 log.debug(f"input_compressed: {input_compressed}") 1346 1347 # input_compressed_format 1348 if input_compressed: 1349 input_compressed_format = "gzip" 1350 else: 1351 input_compressed_format = "none" 1352 log.debug(f"input_compressed_format: {input_compressed_format}") 1353 1354 # Connexion format 1355 connexion_format = self.get_connexion_format() 1356 1357 # Sample size 1358 if not sample_size: 1359 sample_size = -1 1360 log.debug(f"sample_size: {sample_size}") 1361 1362 # Load data 1363 log.debug(f"Load Data from {input_format}") 1364 1365 # DuckDB connexion 1366 if connexion_format in ["duckdb"]: 1367 1368 # Database already exists 1369 if self.input_format 
in ["db", "duckdb"]: 1370 1371 if connexion_format in ["duckdb"]: 1372 log.debug(f"Input file format '{self.input_format}' duckDB") 1373 else: 1374 log.error( 1375 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1376 ) 1377 raise ValueError( 1378 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1379 ) 1380 1381 # Load from existing database format 1382 else: 1383 1384 try: 1385 # Create Table or View 1386 database = Database(database=self.input) 1387 sql_from = database.get_sql_from(sample_size=sample_size) 1388 1389 if access in ["RO"]: 1390 sql_load = ( 1391 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1392 ) 1393 else: 1394 sql_load = ( 1395 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1396 ) 1397 self.conn.execute(sql_load) 1398 1399 except: 1400 # Format not available 1401 log.error(f"Input file format '{self.input_format}' not available") 1402 raise ValueError( 1403 f"Input file format '{self.input_format}' not available" 1404 ) 1405 1406 # SQLite connexion 1407 elif connexion_format in ["sqlite"] and input_format in [ 1408 "vcf", 1409 "tsv", 1410 "csv", 1411 "psv", 1412 ]: 1413 1414 # Main structure 1415 structure = { 1416 "#CHROM": "VARCHAR", 1417 "POS": "INTEGER", 1418 "ID": "VARCHAR", 1419 "REF": "VARCHAR", 1420 "ALT": "VARCHAR", 1421 "QUAL": "VARCHAR", 1422 "FILTER": "VARCHAR", 1423 "INFO": "VARCHAR", 1424 } 1425 1426 # Strcuture with samples 1427 structure_complete = structure 1428 if self.get_header_sample_list(): 1429 structure["FORMAT"] = "VARCHAR" 1430 for sample in self.get_header_sample_list(): 1431 structure_complete[sample] = "VARCHAR" 1432 1433 # Columns list for create and insert 1434 sql_create_table_columns = [] 1435 sql_create_table_columns_list = [] 1436 for column in structure_complete: 1437 column_type = structure_complete[column] 1438 sql_create_table_columns.append( 1439 f'"{column}" {column_type} 
default NULL' 1440 ) 1441 sql_create_table_columns_list.append(f'"{column}"') 1442 1443 # Create database 1444 log.debug(f"Create Table {table_variants}") 1445 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1446 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1447 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1448 self.conn.execute(sql_create_table) 1449 1450 # chunksize define length of file chunk load file 1451 chunksize = 100000 1452 1453 # delimiter 1454 delimiter = file_format_delimiters.get(input_format, "\t") 1455 1456 # Load the input file 1457 with open(self.input, "rt") as input_file: 1458 1459 # Use the appropriate file handler based on the input format 1460 if input_compressed: 1461 input_file = bgzf.open(self.input, "rt") 1462 if input_format in ["vcf"]: 1463 header_len = self.get_header_length() 1464 else: 1465 header_len = 0 1466 1467 # Insert the file contents into a table 1468 self.insert_file_to_table( 1469 input_file, 1470 columns=sql_create_table_columns_list_sql, 1471 header_len=header_len, 1472 sep=delimiter, 1473 chunksize=chunksize, 1474 ) 1475 1476 else: 1477 log.error( 1478 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1479 ) 1480 raise ValueError( 1481 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1482 ) 1483 1484 # Explode INFOS fields into table fields 1485 if self.get_explode_infos(): 1486 self.explode_infos( 1487 prefix=self.get_explode_infos_prefix(), 1488 fields=self.get_explode_infos_fields(), 1489 force=True, 1490 ) 1491 1492 # Create index after insertion 1493 self.create_indexes() 1494 1495 def get_explode_infos(self) -> bool: 1496 """ 1497 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1498 to False if it is not set. 
        :return: The method is returning the value of the "explode_infos" parameter (under the
        "explode" section of param), which is a boolean value. If the parameter is not present, it
        will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of INFO fields to explode, based on
        the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The `explode_infos_fields` parameter specifies the fields to
        be exploded, either as a comma-separated string or a list. Field names may be regex
        patterns, and the keyword "*" means all header fields. When not provided, it is read from
        param "explode" > "explode_infos_fields", and defaults to "*" (all fields)
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: A list of INFO field names to explode, with duplicates removed, obtained by
        stripping each input field, expanding patterns against the header fields, and
        splitting the string by commas.
1528 """ 1529 1530 # If no fields, get it in param 1531 if not explode_infos_fields: 1532 explode_infos_fields = ( 1533 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1534 ) 1535 1536 # If no fields, defined as all fields in header using keyword 1537 if not explode_infos_fields: 1538 explode_infos_fields = "*" 1539 1540 # If fields list not empty 1541 if explode_infos_fields: 1542 1543 # Input fields list 1544 if isinstance(explode_infos_fields, str): 1545 fields_input = explode_infos_fields.split(",") 1546 elif isinstance(explode_infos_fields, list): 1547 fields_input = explode_infos_fields 1548 else: 1549 fields_input = [] 1550 1551 # Fields list without * keyword 1552 fields_without_all = fields_input.copy() 1553 if "*".casefold() in (item.casefold() for item in fields_without_all): 1554 fields_without_all.remove("*") 1555 1556 # Fields in header 1557 fields_in_header = sorted(list(set(self.get_header().infos))) 1558 1559 # Construct list of fields 1560 fields_output = [] 1561 for field in fields_input: 1562 1563 # Strip field 1564 field = field.strip() 1565 1566 # format keyword * in regex 1567 if field.upper() in ["*"]: 1568 field = ".*" 1569 1570 # Find all fields with pattern 1571 r = re.compile(field) 1572 fields_search = sorted(list(filter(r.match, fields_in_header))) 1573 1574 # Remove fields input from search 1575 if field in fields_search: 1576 fields_search = [field] 1577 elif fields_search != [field]: 1578 fields_search = sorted( 1579 list(set(fields_search).difference(fields_input)) 1580 ) 1581 1582 # If field is not in header (avoid not well formatted header) 1583 if not fields_search and not remove_fields_not_in_header: 1584 fields_search = [field] 1585 1586 # Add found fields 1587 for new_field in fields_search: 1588 # Add field, if not already exists, and if it is in header (if asked) 1589 if ( 1590 new_field not in fields_output 1591 and ( 1592 not remove_fields_not_in_header 1593 or new_field in fields_in_header 1594 ) 
1595 and new_field not in [".*"] 1596 ): 1597 fields_output.append(new_field) 1598 1599 return fields_output 1600 1601 else: 1602 1603 return [] 1604 1605 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1606 """ 1607 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1608 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1609 not provided. 1610 1611 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1612 prefix to be used for exploding or expanding information 1613 :type explode_infos_prefix: str 1614 :return: the value of the variable `explode_infos_prefix`. 1615 """ 1616 1617 if not explode_infos_prefix: 1618 explode_infos_prefix = ( 1619 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1620 ) 1621 1622 return explode_infos_prefix 1623 1624 def add_column( 1625 self, 1626 table_name, 1627 column_name, 1628 column_type, 1629 default_value=None, 1630 drop: bool = False, 1631 ) -> dict: 1632 """ 1633 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1634 doesn't already exist. 1635 1636 :param table_name: The name of the table to which you want to add a column 1637 :param column_name: The parameter "column_name" is the name of the column that you want to add 1638 to the table 1639 :param column_type: The `column_type` parameter specifies the data type of the column that you 1640 want to add to the table. It should be a string that represents the desired data type, such as 1641 "INTEGER", "TEXT", "REAL", etc 1642 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1643 default value for the newly added column. 
If a default value is provided, it will be assigned to 1644 the column for any existing rows that do not have a value for that column 1645 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1646 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1647 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1648 to False 1649 :type drop: bool (optional) 1650 :return: a boolean value indicating whether the column was successfully added to the table. 1651 """ 1652 1653 # added 1654 added = False 1655 dropped = False 1656 1657 # Check if the column already exists in the table 1658 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1659 columns = self.get_query_to_df(query).columns.tolist() 1660 if column_name.upper() in [c.upper() for c in columns]: 1661 log.debug( 1662 f"The {column_name} column already exists in the {table_name} table" 1663 ) 1664 if drop: 1665 self.drop_column(table_name=table_name, column_name=column_name) 1666 dropped = True 1667 else: 1668 return None 1669 else: 1670 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1671 1672 # Add column in table 1673 add_column_query = ( 1674 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1675 ) 1676 if default_value is not None: 1677 add_column_query += f" DEFAULT {default_value}" 1678 self.execute_query(add_column_query) 1679 added = not dropped 1680 log.debug( 1681 f"The {column_name} column was successfully added to the {table_name} table" 1682 ) 1683 1684 if added: 1685 added_column = { 1686 "table_name": table_name, 1687 "column_name": column_name, 1688 "column_type": column_type, 1689 "default_value": default_value, 1690 } 1691 else: 1692 added_column = None 1693 1694 return added_column 1695 1696 def drop_column( 1697 self, column: dict = None, table_name: str = None, column_name: str = None 1698 ) -> bool: 1699 """ 1700 The 
`drop_column` function drops a specified column from a given table in a database and returns 1701 True if the column was successfully dropped, and False if the column does not exist in the 1702 table. 1703 1704 :param column: The `column` parameter is a dictionary that contains information about the column 1705 you want to drop. It has two keys: 1706 :type column: dict 1707 :param table_name: The `table_name` parameter is the name of the table from which you want to 1708 drop a column 1709 :type table_name: str 1710 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1711 from the table 1712 :type column_name: str 1713 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1714 and False if the column does not exist in the table. 1715 """ 1716 1717 # Find column infos 1718 if column: 1719 if isinstance(column, dict): 1720 table_name = column.get("table_name", None) 1721 column_name = column.get("column_name", None) 1722 elif isinstance(column, str): 1723 table_name = self.get_table_variants() 1724 column_name = column 1725 else: 1726 table_name = None 1727 column_name = None 1728 1729 if not table_name and not column_name: 1730 return False 1731 1732 # Removed 1733 removed = False 1734 1735 # Check if the column already exists in the table 1736 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1737 columns = self.get_query_to_df(query).columns.tolist() 1738 if column_name in columns: 1739 log.debug(f"The {column_name} column exists in the {table_name} table") 1740 else: 1741 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1742 return False 1743 1744 # Add column in table # ALTER TABLE integers DROP k 1745 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1746 self.execute_query(add_column_query) 1747 removed = True 1748 log.debug( 1749 f"The {column_name} column was successfully dropped to the {table_name} table" 1750 ) 1751 
        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function takes a VCF table and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: A string prefix for the exploded INFO columns. When not provided (or not a
        string), the value of `self.get_explode_infos_prefix()` is used, falling back to "INFO/"
        :type prefix: str
        :param create_index: If `True`, indexes are (re)created after the explosion,
        defaults to False
        :type create_index: bool (optional)
        :param fields: A list of INFO fields to explode (patterns allowed, see
        `get_explode_infos_fields`). When not provided, all INFO fields are exploded
        :type fields: list
        :param force: If `True`, an already-existing column is dropped and re-created, and its
        value is re-extracted, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: If `True`, all the INFO fields are updated with a
        single UPDATE statement; otherwise each field is updated individually.
        Defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The name of the table where the exploded INFO fields will be added as
        columns; defaults to the variants table
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best effort: not all inputs provide extra infos)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # SQL type from header; unknown fields fall back to String/VARCHAR
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract the "<info>=<value>" part of INFO
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (fall back to a single pass when the query fails)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion.

        Creates a composite index on ("#CHROM", "POS", "REF", "ALT"), one
        single-column index per coordinate field, and one index per field in
        `self.index_additionnal_fields`. Skipped entirely when indexing is
        disabled or when the connexion is read-only (config "access" == "RO").
        """

        # Access mode from config ("RO" means read-only: no DDL allowed)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Composite index on the variant primary coordinates
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            # One index per exploded/additional INFO field
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes attached to the variants table.

        Lists existing indexes through the engine's catalog (duckdb_indexes
        for DuckDB, sqlite_master for SQLite) and drops each one. No-op when
        the connexion is read-only (config "access" == "RO").
        """

        # Access mode from config ("RO" means read-only: no DDL allowed)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
            # NOTE(review): for any connexion format other than duckdb/sqlite,
            # sql_list_indexes is unbound and the next line raises NameError —
            # presumably only those two formats are possible; confirm.
            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        Read the header of a VCF file and return the header lines.

        Reads from the current position of the file object up to and including
        the "#CHROM" column line, which terminates a VCF header.

        :param f: an open file object positioned at the start of the header
        :return: the list of header lines (each line keeps its newline)
        """

        header_list = []
        for line in f:
            header_list.append(line)
            # The "#CHROM" line is the last header line of a VCF
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file, handling both bgzip-compressed and
        plain-text files.

        :param file: path to the VCF (or VCF header) file to read
        :type file: str
        :return: the list of header lines, as returned by `read_vcf_header`
        """

        # Open with bgzf when the file is compressed, plain text otherwise
        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
2064 """ 2065 if query: 2066 return self.conn.execute(query) # .fetchall() 2067 else: 2068 return None 2069 2070 def export_output( 2071 self, 2072 output_file: str | None = None, 2073 output_header: str | None = None, 2074 export_header: bool = True, 2075 query: str | None = None, 2076 parquet_partitions: list | None = None, 2077 chunk_size: int | None = None, 2078 threads: int | None = None, 2079 sort: bool = False, 2080 index: bool = False, 2081 order_by: str | None = None, 2082 ) -> bool: 2083 """ 2084 The `export_output` function exports data from a VCF file to a specified output file in various 2085 formats, including VCF, CSV, TSV, PSV, and Parquet. 2086 2087 :param output_file: The `output_file` parameter is a string that specifies the name of the 2088 output file to be generated by the function. This is where the exported data will be saved 2089 :type output_file: str 2090 :param output_header: The `output_header` parameter is a string that specifies the name of the 2091 file where the header of the VCF file will be exported. If this parameter is not provided, the 2092 header will be exported to a file with the same name as the `output_file` parameter, but with 2093 the extension " 2094 :type output_header: str 2095 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2096 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2097 True, the header will be exported to a file. If `export_header` is False, the header will not 2098 be, defaults to True, if output format is not VCF 2099 :type export_header: bool (optional) 2100 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2101 select specific data from the VCF file before exporting it. 
If provided, only the data that 2102 matches the query will be exported 2103 :type query: str 2104 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2105 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2106 organize data in a hierarchical directory structure based on the values of one or more columns. 2107 This can improve query performance when working with large datasets 2108 :type parquet_partitions: list 2109 :param chunk_size: The `chunk_size` parameter specifies the number of 2110 records in batch when exporting data in Parquet format. This parameter is used for 2111 partitioning the Parquet file into multiple files. 2112 :type chunk_size: int 2113 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2114 threads to be used during the export process. It determines the level of parallelism and can 2115 improve the performance of the export operation. If not provided, the function will use the 2116 default number of threads 2117 :type threads: int 2118 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2119 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2120 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2121 False 2122 :type sort: bool (optional) 2123 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2124 created on the output file. If `index` is True, an index will be created. If `index` is False, 2125 no index will be created. The default value is False, defaults to False 2126 :type index: bool (optional) 2127 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2128 sorting the output file. This parameter is only applicable when exporting data in VCF format 2129 :type order_by: str 2130 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2131 None if it doesn't. 2132 """ 2133 2134 # Log 2135 log.info("Exporting...") 2136 2137 # Full path 2138 output_file = full_path(output_file) 2139 output_header = full_path(output_header) 2140 2141 # Config 2142 config = self.get_config() 2143 2144 # Param 2145 param = self.get_param() 2146 2147 # Tmp files to remove 2148 tmp_to_remove = [] 2149 2150 # If no output, get it 2151 if not output_file: 2152 output_file = self.get_output() 2153 2154 # If not threads 2155 if not threads: 2156 threads = self.get_threads() 2157 2158 # Auto header name with extension 2159 if export_header or output_header: 2160 if not output_header: 2161 output_header = f"{output_file}.hdr" 2162 # Export header 2163 self.export_header(output_file=output_file) 2164 2165 # Switch off export header if VCF output 2166 output_file_type = get_file_format(output_file) 2167 if output_file_type in ["vcf"]: 2168 export_header = False 2169 tmp_to_remove.append(output_header) 2170 2171 # Chunk size 2172 if not chunk_size: 2173 chunk_size = config.get("chunk_size", None) 2174 2175 # Parquet partition 2176 if not parquet_partitions: 2177 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2178 if parquet_partitions and isinstance(parquet_partitions, str): 2179 parquet_partitions = parquet_partitions.split(",") 2180 2181 # Order by 2182 if not order_by: 2183 order_by = param.get("export", {}).get("order_by", "") 2184 2185 # Header in output 2186 header_in_output = param.get("export", {}).get("include_header", False) 2187 2188 # Database 2189 database_source = self.get_connexion() 2190 2191 # Connexion format 2192 connexion_format = self.get_connexion_format() 2193 2194 # Explode infos 2195 if self.get_explode_infos(): 2196 self.explode_infos( 2197 prefix=self.get_explode_infos_prefix(), 2198 fields=self.get_explode_infos_fields(), 2199 force=False, 2200 ) 2201 2202 # if connexion_format in ["sqlite"] or query: 
2203 if connexion_format in ["sqlite"]: 2204 2205 # Export in Parquet 2206 random_tmp = "".join( 2207 random.choice(string.ascii_lowercase) for i in range(10) 2208 ) 2209 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2210 tmp_to_remove.append(database_source) 2211 2212 # Table Variants 2213 table_variants = self.get_table_variants() 2214 2215 # Create export query 2216 sql_query_export_subquery = f""" 2217 SELECT * FROM {table_variants} 2218 """ 2219 2220 # Write source file 2221 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2222 2223 # Create database 2224 database = Database( 2225 database=database_source, 2226 table="variants", 2227 header_file=output_header, 2228 conn_config=self.get_connexion_config(), 2229 ) 2230 2231 # Existing colomns header 2232 existing_columns_header = database.get_header_columns_from_database() 2233 2234 # Sample list 2235 if output_file_type in ["vcf"]: 2236 get_samples = self.get_samples() 2237 get_samples_check = self.get_samples_check() 2238 samples_force = get_samples is not None 2239 sample_list = self.get_header_sample_list( 2240 check=get_samples_check, 2241 samples=get_samples, 2242 samples_force=samples_force, 2243 ) 2244 else: 2245 sample_list = None 2246 2247 # Export file 2248 database.export( 2249 output_database=output_file, 2250 output_header=output_header, 2251 existing_columns_header=existing_columns_header, 2252 parquet_partitions=parquet_partitions, 2253 chunk_size=chunk_size, 2254 threads=threads, 2255 sort=sort, 2256 index=index, 2257 header_in_output=header_in_output, 2258 order_by=order_by, 2259 query=query, 2260 export_header=export_header, 2261 sample_list=sample_list, 2262 ) 2263 2264 # Remove 2265 remove_if_exists(tmp_to_remove) 2266 2267 return (os.path.exists(output_file) or None) and ( 2268 os.path.exists(output_file) or None 2269 ) 2270 2271 def get_extra_infos(self, table: str = None) -> list: 2272 """ 2273 The `get_extra_infos` function returns 
a list of columns that are in a specified table but not 2274 in the header. 2275 2276 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2277 name of the table from which you want to retrieve the extra columns that are not present in the 2278 header. If the `table` parameter is not provided when calling the function, it will default to 2279 using the variants 2280 :type table: str 2281 :return: A list of columns that are in the specified table but not in the header of the table. 2282 """ 2283 2284 header_columns = [] 2285 2286 if not table: 2287 table = self.get_table_variants(clause="from") 2288 header_columns = self.get_header_columns() 2289 2290 # Check all columns in the database 2291 query = f""" SELECT * FROM {table} LIMIT 1 """ 2292 log.debug(f"query {query}") 2293 table_columns = self.get_query_to_df(query).columns.tolist() 2294 extra_columns = [] 2295 2296 # Construct extra infos (not in header) 2297 for column in table_columns: 2298 if column not in header_columns: 2299 extra_columns.append(column) 2300 2301 return extra_columns 2302 2303 def get_extra_infos_sql(self, table: str = None) -> str: 2304 """ 2305 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2306 by double quotes 2307 2308 :param table: The name of the table to get the extra infos from. If None, the default table is 2309 used 2310 :type table: str 2311 :return: A string of the extra infos 2312 """ 2313 2314 return ", ".join( 2315 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2316 ) 2317 2318 def export_header( 2319 self, 2320 header_name: str = None, 2321 output_file: str = None, 2322 output_file_ext: str = ".hdr", 2323 clean_header: bool = True, 2324 remove_chrom_line: bool = False, 2325 ) -> str: 2326 """ 2327 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2328 specified options, and writes it to a new file. 
2329 2330 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2331 this parameter is not specified, the header will be written to the output file 2332 :type header_name: str 2333 :param output_file: The `output_file` parameter in the `export_header` function is used to 2334 specify the name of the output file where the header will be written. If this parameter is not 2335 provided, the header will be written to a temporary file 2336 :type output_file: str 2337 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2338 string that represents the extension of the output header file. By default, it is set to ".hdr" 2339 if not specified by the user. This extension will be appended to the `output_file` name to 2340 create the final, defaults to .hdr 2341 :type output_file_ext: str (optional) 2342 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2343 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2344 `True`, the function will clean the header by modifying certain lines based on a specific 2345 pattern. If `clean_header`, defaults to True 2346 :type clean_header: bool (optional) 2347 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2348 boolean flag that determines whether the #CHROM line should be removed from the header before 2349 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2350 defaults to False 2351 :type remove_chrom_line: bool (optional) 2352 :return: The function `export_header` returns the name of the temporary header file that is 2353 created. 
2354 """ 2355 2356 if not header_name and not output_file: 2357 output_file = self.get_output() 2358 2359 if self.get_header(): 2360 2361 # Get header object 2362 header_obj = self.get_header() 2363 2364 # Create database 2365 db_for_header = Database(database=self.get_input()) 2366 2367 # Get real columns in the file 2368 db_header_columns = db_for_header.get_columns() 2369 2370 with tempfile.TemporaryDirectory() as tmpdir: 2371 2372 # Write header file 2373 header_file_tmp = os.path.join(tmpdir, "header") 2374 f = open(header_file_tmp, "w") 2375 vcf.Writer(f, header_obj) 2376 f.close() 2377 2378 # Replace #CHROM line with rel columns 2379 header_list = db_for_header.read_header_file( 2380 header_file=header_file_tmp 2381 ) 2382 header_list[-1] = "\t".join(db_header_columns) 2383 2384 # Remove CHROM line 2385 if remove_chrom_line: 2386 header_list.pop() 2387 2388 # Clean header 2389 if clean_header: 2390 header_list_clean = [] 2391 for head in header_list: 2392 # Clean head for malformed header 2393 head_clean = head 2394 head_clean = re.subn( 2395 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2396 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2397 head_clean, 2398 2, 2399 )[0] 2400 # Write header 2401 header_list_clean.append(head_clean) 2402 header_list = header_list_clean 2403 2404 tmp_header_name = output_file + output_file_ext 2405 2406 f = open(tmp_header_name, "w") 2407 for line in header_list: 2408 f.write(line) 2409 f.close() 2410 2411 return tmp_header_name 2412 2413 def export_variant_vcf( 2414 self, 2415 vcf_file, 2416 remove_info: bool = False, 2417 add_samples: bool = True, 2418 list_samples: list = [], 2419 where_clause: str = "", 2420 index: bool = False, 2421 threads: int | None = None, 2422 ) -> bool | None: 2423 """ 2424 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2425 remove INFO field, add samples, and control compression and indexing. 
2426 2427 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2428 written to. It is the output file that will contain the filtered VCF data based on the specified 2429 parameters 2430 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2431 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2432 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2433 in, defaults to False 2434 :type remove_info: bool (optional) 2435 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2436 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2437 If set to False, the samples will be removed. The default value is True, defaults to True 2438 :type add_samples: bool (optional) 2439 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2440 in the output VCF file. By default, all samples will be included. If you provide a list of 2441 samples, only those samples will be included in the output file 2442 :type list_samples: list 2443 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2444 determines whether or not to create an index for the output VCF file. If `index` is set to 2445 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2446 :type index: bool (optional) 2447 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2448 number of threads to use for exporting the VCF file. It determines how many parallel threads 2449 will be used during the export process. More threads can potentially speed up the export process 2450 by utilizing multiple cores of the processor. 
If 2451 :type threads: int | None 2452 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2453 method with various parameters including the output file, query, threads, sort flag, and index 2454 flag. The `export_output` method is responsible for exporting the VCF data based on the 2455 specified parameters and configurations provided in the `export_variant_vcf` function. 2456 """ 2457 2458 # Config 2459 config = self.get_config() 2460 2461 # Extract VCF 2462 log.debug("Export VCF...") 2463 2464 # Table variants 2465 table_variants = self.get_table_variants() 2466 2467 # Threads 2468 if not threads: 2469 threads = self.get_threads() 2470 2471 # Info fields 2472 if remove_info: 2473 if not isinstance(remove_info, str): 2474 remove_info = "." 2475 info_field = f"""'{remove_info}' as INFO""" 2476 else: 2477 info_field = "INFO" 2478 2479 # Samples fields 2480 if add_samples: 2481 if not list_samples: 2482 list_samples = self.get_header_sample_list() 2483 if list_samples: 2484 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2485 else: 2486 samples_fields = "" 2487 log.debug(f"samples_fields: {samples_fields}") 2488 else: 2489 samples_fields = "" 2490 2491 # Where clause 2492 if where_clause is None: 2493 where_clause = "" 2494 2495 # Variants 2496 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2497 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2498 log.debug(f"sql_query_select={sql_query_select}") 2499 2500 return self.export_output( 2501 output_file=vcf_file, 2502 output_header=None, 2503 export_header=True, 2504 query=sql_query_select, 2505 parquet_partitions=None, 2506 chunk_size=config.get("chunk_size", None), 2507 threads=threads, 2508 sort=True, 2509 index=index, 2510 order_by=None, 2511 ) 2512 2513 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2514 """ 2515 It takes a list of commands and runs 
them in parallel using the number of threads specified 2516 2517 :param commands: A list of commands to run 2518 :param threads: The number of threads to use, defaults to 1 (optional) 2519 """ 2520 2521 run_parallel_commands(commands, threads) 2522 2523 def get_threads(self, default: int = 1) -> int: 2524 """ 2525 This function returns the number of threads to use for a job, with a default value of 1 if not 2526 specified. 2527 2528 :param default: The `default` parameter in the `get_threads` method is used to specify the 2529 default number of threads to use if no specific value is provided. If no value is provided for 2530 the `threads` parameter in the configuration or input parameters, the `default` value will be 2531 used, defaults to 1 2532 :type default: int (optional) 2533 :return: the number of threads to use for the current job. 2534 """ 2535 2536 # Config 2537 config = self.get_config() 2538 2539 # Param 2540 param = self.get_param() 2541 2542 # Input threads 2543 input_thread = param.get("threads", config.get("threads", None)) 2544 2545 # Check threads 2546 if not input_thread: 2547 threads = default 2548 elif int(input_thread) <= 0: 2549 threads = os.cpu_count() 2550 else: 2551 threads = int(input_thread) 2552 return threads 2553 2554 def get_memory(self, default: str = None) -> str: 2555 """ 2556 This function retrieves the memory value from parameters or configuration with a default value 2557 if not found. 2558 2559 :param default: The `get_memory` function takes in a default value as a string parameter. This 2560 default value is used as a fallback in case the `memory` parameter is not provided in the 2561 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2562 the function 2563 :type default: str 2564 :return: The `get_memory` function returns a string value representing the memory parameter. If 2565 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2566 return the default value provided as an argument to the function. 2567 """ 2568 2569 # Config 2570 config = self.get_config() 2571 2572 # Param 2573 param = self.get_param() 2574 2575 # Input threads 2576 input_memory = param.get("memory", config.get("memory", None)) 2577 2578 # Check threads 2579 if input_memory: 2580 memory = input_memory 2581 else: 2582 memory = default 2583 2584 return memory 2585 2586 def update_from_vcf(self, vcf_file: str) -> None: 2587 """ 2588 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2589 2590 :param vcf_file: the path to the VCF file 2591 """ 2592 2593 connexion_format = self.get_connexion_format() 2594 2595 if connexion_format in ["duckdb"]: 2596 self.update_from_vcf_duckdb(vcf_file) 2597 elif connexion_format in ["sqlite"]: 2598 self.update_from_vcf_sqlite(vcf_file) 2599 2600 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2601 """ 2602 It takes a VCF file and updates the INFO column of the variants table in the database with the 2603 INFO column of the VCF file 2604 2605 :param vcf_file: the path to the VCF file 2606 """ 2607 2608 # varaints table 2609 table_variants = self.get_table_variants() 2610 2611 # Loading VCF into temporaire table 2612 skip = self.get_header_length(file=vcf_file) 2613 vcf_df = pd.read_csv( 2614 vcf_file, 2615 sep="\t", 2616 engine="c", 2617 skiprows=skip, 2618 header=0, 2619 low_memory=False, 2620 ) 2621 sql_query_update = f""" 2622 UPDATE {table_variants} as table_variants 2623 SET INFO = concat( 2624 CASE 2625 WHEN INFO NOT IN ('', '.') 2626 THEN INFO 2627 ELSE '' 2628 END, 2629 ( 2630 SELECT 2631 concat( 2632 CASE 2633 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2634 THEN ';' 2635 ELSE '' 2636 END 2637 , 2638 CASE 2639 WHEN table_parquet.INFO NOT IN ('','.') 2640 THEN table_parquet.INFO 2641 ELSE '' 2642 END 2643 ) 2644 FROM vcf_df as table_parquet 2645 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2646 AND table_parquet.\"POS\" = table_variants.\"POS\" 2647 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2648 AND table_parquet.\"REF\" = table_variants.\"REF\" 2649 AND table_parquet.INFO NOT IN ('','.') 2650 ) 2651 ) 2652 ; 2653 """ 2654 self.conn.execute(sql_query_update) 2655 2656 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2657 """ 2658 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2659 table, then updates the INFO column of the variants table with the INFO column of the temporary 2660 table 2661 2662 :param vcf_file: The path to the VCF file you want to update the database with 2663 """ 2664 2665 # Create a temporary table for the VCF 2666 table_vcf = "tmp_vcf" 2667 sql_create = ( 2668 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2669 ) 2670 self.conn.execute(sql_create) 2671 2672 # Loading VCF into temporaire table 2673 vcf_df = pd.read_csv( 2674 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2675 ) 2676 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2677 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2678 2679 # Update table 'variants' with VCF data 2680 # warning: CONCAT as || operator 2681 sql_query_update = f""" 2682 UPDATE variants as table_variants 2683 SET INFO = CASE 2684 WHEN INFO NOT IN ('', '.') 2685 THEN INFO 2686 ELSE '' 2687 END || 2688 ( 2689 SELECT 2690 CASE 2691 WHEN table_variants.INFO NOT IN ('','.') 2692 AND table_vcf.INFO NOT IN ('','.') 2693 THEN ';' 2694 ELSE '' 2695 END || 2696 CASE 2697 WHEN table_vcf.INFO NOT IN ('','.') 2698 THEN table_vcf.INFO 2699 ELSE '' 2700 END 2701 FROM {table_vcf} as table_vcf 2702 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2703 AND table_vcf.\"POS\" = table_variants.\"POS\" 2704 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2705 AND table_vcf.\"REF\" = table_variants.\"REF\" 2706 
) 2707 """ 2708 self.conn.execute(sql_query_update) 2709 2710 # Drop temporary table 2711 sql_drop = f"DROP TABLE {table_vcf}" 2712 self.conn.execute(sql_drop) 2713 2714 def drop_variants_table(self) -> None: 2715 """ 2716 > This function drops the variants table 2717 """ 2718 2719 table_variants = self.get_table_variants() 2720 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2721 self.conn.execute(sql_table_variants) 2722 2723 def set_variant_id( 2724 self, variant_id_column: str = "variant_id", force: bool = None 2725 ) -> str: 2726 """ 2727 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2728 `#CHROM`, `POS`, `REF`, and `ALT` columns 2729 2730 :param variant_id_column: The name of the column to be created in the variants table, defaults 2731 to variant_id 2732 :type variant_id_column: str (optional) 2733 :param force: If True, the variant_id column will be created even if it already exists 2734 :type force: bool 2735 :return: The name of the column that contains the variant_id 2736 """ 2737 2738 # Assembly 2739 assembly = self.get_param().get( 2740 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2741 ) 2742 2743 # INFO/Tag prefix 2744 prefix = self.get_explode_infos_prefix() 2745 2746 # Explode INFO/SVTYPE 2747 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2748 2749 # variants table 2750 table_variants = self.get_table_variants() 2751 2752 # variant_id column 2753 if not variant_id_column: 2754 variant_id_column = "variant_id" 2755 2756 # Creta variant_id column 2757 if "variant_id" not in self.get_extra_infos() or force: 2758 2759 # Create column 2760 self.add_column( 2761 table_name=table_variants, 2762 column_name=variant_id_column, 2763 column_type="UBIGINT", 2764 default_value="0", 2765 ) 2766 2767 # Update column 2768 self.conn.execute( 2769 f""" 2770 UPDATE {table_variants} 2771 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
 '"{prefix}SVTYPE"')
                """
            )
            # NOTE(review): the last hash() argument above is the single-quoted
            # literal '"{prefix}SVTYPE"' — a constant string, not the column
            # value. It likely should reference the "{prefix}SVTYPE" column;
            # changing it would alter existing variant_id values, so confirm
            # intent before fixing.

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the variant_id column name, creating and populating the column
        through `set_variant_id` if needed.

        :param variant_id_column: the name of the column containing the
            variant IDs, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: if True, the variant_id column is (re)created even if it
            already exists
        :type force: bool
        :return: the variant_id column name
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        Scan for available annotation databases matching the given formats and
        releases, for the configured assembly.

        NOTE(review): both defaults are mutable lists — safe only as long as
        callers never mutate them; confirm.

        :param database_formats: formats of the databases to scan for,
            defaults to ["parquet"]
        :type database_formats: list
        :param database_releases: releases of the databases to scan for,
            defaults to ["current"]
        :type database_releases: list
        :return: a dictionary of information about the databases found
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (merged from annotations/parquet/bcftools config)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut params into the
        # "annotations" string with a "tool:" prefix.
        # NOTE(review): "!= None" should be "is not None" (PEP 8) throughout
        # this section; kept as-is (behavior identical here).
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        # (everything after the first ":" is joined without separator)
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    # (bcftools_preference is hard-coded False above,
                                    # so the bcftools branch is currently unreachable)
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice below (harmless)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Dispatch to each configured annotation backend
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser",
None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        This function annotates the VCF with SnpSift, using compressed and
        tabix-indexed annotation databases (output is post-processed with
        bcftools to rename INFO fields when requested).

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        # NOTE(review): message says "bcftools" but this is the SnpSift path
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - snpSift
        snpsift_bin_command = get_bin_command(
            bin="SnpSift.jar",
            tool="snpsift",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpsift_bin_command:
            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - bcftools
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("snpsift", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # Init
                commands = {}

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()
                    db_tbi_file = f"{db_file}.tbi"
                    db_file_compressed = database.is_compressed()

                    # Check if compressed
                    if not db_file_compressed:
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )

                    # Check if indexed
                    if not os.path.exists(db_tbi_file):
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )

                    # Check index - try to create if not exists
                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        log.error(f"Annotation annotation index: {db_tbi_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                        )
                    else:

                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # # Create file for field rename
                        # log.debug("Create file for field rename")
                        # tmp_rename = NamedTemporaryFile(
                        #     prefix=self.get_prefix(),
                        #     dir=self.get_tmp_dir(),
                        #     suffix=".rename",
                        #     delete=False,
                        # )
                        # tmp_rename_name = tmp_rename.name
                        # tmp_files.append(tmp_rename_name)

                        # Number of fields
                        nb_annotation_field = 0
                        annotation_list = []
                        annotation_infos_rename_list = []

                        for annotation_field in annotation_fields:

                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                            annotation_fields_new_name = annotation_fields.get(
                                annotation_field, annotation_field
                            )
                            if not annotation_fields_new_name:
                                annotation_fields_new_name = annotation_field

                            # Check if field is in DB and if field is not already in input data
                            if (
                                annotation_field in db_hdr_vcf.get_header().infos
                                and annotation_fields_new_name
                                not in self.get_header().infos
                            ):

                                log.info(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                                )

                                # BCFTools annotate param to rename fields
                                if annotation_field != annotation_fields_new_name:
                                    annotation_infos_rename_list.append(
                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                    )

                                # Add INFO field to header
                                db_hdr_vcf_header_infos_number = (
                                    db_hdr_vcf_header_infos[annotation_field].num or "."
                                )
                                db_hdr_vcf_header_infos_type = (
                                    db_hdr_vcf_header_infos[annotation_field].type
                                    or "String"
                                )
                                db_hdr_vcf_header_infos_description = (
                                    db_hdr_vcf_header_infos[annotation_field].desc
                                    or f"{annotation_field} description"
                                )
                                db_hdr_vcf_header_infos_source = (
                                    db_hdr_vcf_header_infos[annotation_field].source
                                    or "unknown"
                                )
                                db_hdr_vcf_header_infos_version = (
                                    db_hdr_vcf_header_infos[annotation_field].version
                                    or "unknown"
                                )

                                vcf_reader.infos[annotation_fields_new_name] = (
                                    vcf.parser._Info(
                                        annotation_fields_new_name,
                                        db_hdr_vcf_header_infos_number,
                                        db_hdr_vcf_header_infos_type,
                                        db_hdr_vcf_header_infos_description,
                                        db_hdr_vcf_header_infos_source,
                                        db_hdr_vcf_header_infos_version,
                                        self.code_type_map[
                                            db_hdr_vcf_header_infos_type
                                        ],
                                    )
                                )

                                annotation_list.append(annotation_field)

                                nb_annotation_field += 1

                            else:

                                if (
                                    annotation_field
                                    not in db_hdr_vcf.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
                                    )
                                if (
                                    annotation_fields_new_name
                                    in self.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                    )

                        log.info(
                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                        )

                        annotation_infos = ",".join(annotation_list)

                        if annotation_infos != "":

                            # Annotated VCF (and error file)
                            tmp_annotation_vcf_name = os.path.join(
                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
                            )
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )

                            # Add fields to annotate
                            if not annotation_fields_full:
                                annotation_infos_option = f"-info {annotation_infos}"
                            else:
                                annotation_infos_option = ""

                            # Info fields rename
                            if annotation_infos_rename_list:
                                annotation_infos_rename = " -c " + ",".join(
                                    annotation_infos_rename_list
                                )
                            else:
                                annotation_infos_rename = ""

                            # Annotate command: SnpSift annotate piped into
                            # bcftools annotate (field renames, bgzip level 1 output)
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

                if commands:

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )
                    # NOTE(review): leftover debug copy to /tmp — should be removed
                    # or guarded by a debug flag
                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                    # Num command
                    nb_command = 0

                    # Annotate
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # Debug
                        # NOTE(review): leftover debug copy to /tmp — should be removed
                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                        # Update variants
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        This function annotates the VCF with bcftools, using compressed and
        tabix-indexed annotation databases, per-chromosome region files, and a
        final bcftools merge.

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions (+/- 1Mb window around each variant,
                            # clamped at 0, then merged into intervals)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: collect stderr from all commands,
                    # surface bcftools warnings/errors, fail on any "[E::"
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True on success, False when the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing folder is only logged here, execution continues —
        # the later databases_download_exomiser() call presumably creates it; confirm.
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser: locate the exomiser-cli jar launch command
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section ("annotation" -> "exomiser")
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, then application default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs genotypes, so at least one sample is required
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): exomiser_java_options is built but not referenced in the
        # command constructed below — confirm whether get_bin_command() injects it.
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if an Exomiser field exists)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # (depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket ("proband" left empty; only "subject" is filled below)
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample of the VCF by default)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep only digits, e.g. "HP:0001156" -> "HP:0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # NOTE(review): removing from the list while iterating it may skip
                    # elements if several hiPhivePrioritiser steps exist — confirm.
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles (the VCF exported below is the Exomiser input)
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): the check is on the top-level dict but the write goes
                # under "phenopacket" — confirm the check should not be on
                # param_exomiser_analysis_dict["phenopacket"] instead.
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param)
                    # NOTE(review): this branch does not force outputFileName to
                    # "howard", while the results are read below from files named
                    # "howard.*" — confirm user-provided outputOptions set it.
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict (shallow copy is enough: only the top-level
                # "phenopacket" key is popped below)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree (deduplicated)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is initialized but never used below
                # (exomiser_command_analysis is built instead) — likely dead code.
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the file was shipped with the release)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param: use split analysis + phenopacket files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually unique sample): use the full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (no shell; non-zero exit code aborts the annotation)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Option: explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: only the schema is needed)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (positional/core VCF columns, not annotations)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type
                            # NOTE(review): the dataframe comes from a LIMIT 0 query,
                            # so the pd.to_numeric(...).all() check runs on an empty
                            # series (vacuously True) — confirm the Float/Integer
                            # inference behaves as intended on empty samples.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name into a valid INFO id
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            # Register the new INFO field in the in-memory VCF header
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to the concat fields for the UPDATE query below
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append TSV-derived fields to INFO, joining on
                    # CHROM/POS/REF/ALT ('chr' is prepended to the Exomiser CONTIG)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                                    CASE
                                        WHEN INFO NOT IN ('', '.')
                                        THEN INFO
                                        ELSE ''
                                    END,
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    (
                                        SELECT
                                            concat(
                                                {",".join(sql_query_update_concat_fields)}
                                            )
                                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                        AND table_parquet.\"START\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    )
                                )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotates the variants with snpEff.

        Exports the variants to a temporary VCF, runs the snpEff jar on it,
        merges the resulting header INFO fields into the in-memory header, and
        updates the variants table from the annotated VCF.

        :param threads: The number of threads to use
        :return: None (returns early when the variants table is empty).
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used in this method's
        # visible code — confirm whether it is intended for tmp-file cleanup.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")

        # Config - snpEff bin command
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        # NOTE(review): 'options' duplicates the snpeff_options read below and is
        # only logged — likely redundant.
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options ("OUTPUT" placeholder in stats paths is replaced by the
        # output file name)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF (only the name of this NamedTemporaryFile is used below)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated in place below to merge snpEff INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built but not referenced in the
        # snpeff_command below — confirm get_bin_command() injects java options.
        # Also the log message below says "Exomiser" but this is the snpEff value.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Always re-annotate, even if an ANN field already exists
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloaded if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command (stdout -> annotated VCF, stderr appended to the .err file)
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: collect [W::]/[E::] lines from stderr captures
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any [E::] line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header
            # NOTE(review): the variable is named 'annovar_vcf_header' but holds the
            # snpEff output header — misleading name.
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            # Merge new INFO fields from the annotated VCF into the current header
            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        :param threads: number of threads to use
        :return: the value of the variable "return_value".
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
bin_type="bin", 5139 config=config, 5140 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5141 ) 5142 if not bcftools_bin_command: 5143 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5144 log.error(msg_err) 5145 raise ValueError(msg_err) 5146 5147 # Config - annovar databases 5148 annovar_databases = ( 5149 config.get("folders", {}) 5150 .get("databases", {}) 5151 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5152 ) 5153 annovar_databases = full_path(annovar_databases) 5154 if annovar_databases != "" and not os.path.exists(annovar_databases): 5155 os.makedirs(annovar_databases) 5156 5157 # Param 5158 param = self.get_param() 5159 log.debug("Param: " + str(param)) 5160 5161 # Param - options 5162 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5163 log.debug("Options: " + str(options)) 5164 5165 # Param - annotations 5166 annotations = ( 5167 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5168 ) 5169 log.debug("Annotations: " + str(annotations)) 5170 5171 # Param - Assembly 5172 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5173 5174 # Annovar database assembly 5175 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5176 if annovar_databases_assembly != "" and not os.path.exists( 5177 annovar_databases_assembly 5178 ): 5179 os.makedirs(annovar_databases_assembly) 5180 5181 # Data 5182 table_variants = self.get_table_variants() 5183 5184 # Check if not empty 5185 log.debug("Check if not empty") 5186 sql_query_chromosomes = ( 5187 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5188 ) 5189 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5190 if not sql_query_chromosomes_df["count"][0]: 5191 log.info(f"VCF empty") 5192 return 5193 5194 # VCF header 5195 vcf_reader = self.get_header() 5196 log.debug("Initial header: " + str(vcf_reader.infos)) 5197 5198 # Existing annotations 5199 for vcf_annotation in 
self.get_header().infos: 5200 5201 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5202 log.debug( 5203 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5204 ) 5205 5206 force_update_annotation = True 5207 5208 if annotations: 5209 5210 commands = [] 5211 tmp_annotates_vcf_name_list = [] 5212 5213 # Export in VCF 5214 log.debug("Create initial file to annotate") 5215 tmp_vcf = NamedTemporaryFile( 5216 prefix=self.get_prefix(), 5217 dir=self.get_tmp_dir(), 5218 suffix=".vcf.gz", 5219 delete=False, 5220 ) 5221 tmp_vcf_name = tmp_vcf.name 5222 tmp_files.append(tmp_vcf_name) 5223 tmp_files.append(tmp_vcf_name + ".tbi") 5224 5225 # Export VCF file 5226 self.export_variant_vcf( 5227 vcf_file=tmp_vcf_name, 5228 remove_info=".", 5229 add_samples=False, 5230 index=True, 5231 ) 5232 5233 # Create file for field rename 5234 log.debug("Create file for field rename") 5235 tmp_rename = NamedTemporaryFile( 5236 prefix=self.get_prefix(), 5237 dir=self.get_tmp_dir(), 5238 suffix=".rename", 5239 delete=False, 5240 ) 5241 tmp_rename_name = tmp_rename.name 5242 tmp_files.append(tmp_rename_name) 5243 5244 # Check Annovar database 5245 log.debug( 5246 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5247 ) 5248 databases_download_annovar( 5249 folder=annovar_databases, 5250 files=list(annotations.keys()), 5251 assemblies=[assembly], 5252 ) 5253 5254 for annotation in annotations: 5255 annotation_fields = annotations[annotation] 5256 5257 if not annotation_fields: 5258 annotation_fields = {"INFO": None} 5259 5260 log.info(f"Annotations Annovar - database '{annotation}'") 5261 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5262 5263 # Tmp file for annovar 5264 err_files = [] 5265 tmp_annotate_vcf_directory = TemporaryDirectory( 5266 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5267 ) 5268 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5269 
tmp_annotate_vcf_name_annovar = ( 5270 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5271 ) 5272 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5273 err_files.append(tmp_annotate_vcf_name_err) 5274 tmp_files.append(tmp_annotate_vcf_name_err) 5275 5276 # Tmp file final vcf annotated by annovar 5277 tmp_annotate_vcf = NamedTemporaryFile( 5278 prefix=self.get_prefix(), 5279 dir=self.get_tmp_dir(), 5280 suffix=".vcf.gz", 5281 delete=False, 5282 ) 5283 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5284 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5285 tmp_files.append(tmp_annotate_vcf_name) 5286 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5287 5288 # Number of fields 5289 annotation_list = [] 5290 annotation_renamed_list = [] 5291 5292 for annotation_field in annotation_fields: 5293 5294 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5295 annotation_fields_new_name = annotation_fields.get( 5296 annotation_field, annotation_field 5297 ) 5298 if not annotation_fields_new_name: 5299 annotation_fields_new_name = annotation_field 5300 5301 if ( 5302 force_update_annotation 5303 or annotation_fields_new_name not in self.get_header().infos 5304 ): 5305 annotation_list.append(annotation_field) 5306 annotation_renamed_list.append(annotation_fields_new_name) 5307 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5308 log.warning( 5309 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5310 ) 5311 5312 # Add rename info 5313 run_parallel_commands( 5314 [ 5315 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5316 ], 5317 1, 5318 ) 5319 5320 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5321 log.debug("annotation_list: " + str(annotation_list)) 5322 5323 # protocol 5324 protocol = annotation 5325 5326 # argument 5327 argument = "" 5328 5329 # operation 5330 operation = "f" 
5331 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5332 "ensGene" 5333 ): 5334 operation = "g" 5335 if options.get("genebase", None): 5336 argument = f"""'{options.get("genebase","")}'""" 5337 elif annotation in ["cytoBand"]: 5338 operation = "r" 5339 5340 # argument option 5341 argument_option = "" 5342 if argument != "": 5343 argument_option = " --argument " + argument 5344 5345 # command options 5346 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5347 for option in options: 5348 if option not in ["genebase"]: 5349 command_options += f""" --{option}={options[option]}""" 5350 5351 # Command 5352 5353 # Command - Annovar 5354 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5355 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5356 5357 # Command - start pipe 5358 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5359 5360 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5361 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5362 5363 # Command - Special characters (refGene annotation) 5364 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5365 5366 # Command - Clean empty fields (with value ".") 5367 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5368 5369 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5370 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5371 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5372 # for ann in annotation_renamed_list: 5373 for ann in annotation_list: 5374 annovar_fields_to_keep.append(f"^INFO/{ann}") 5375 5376 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5377 5378 # Command - indexing 5379 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5380 5381 log.debug(f"Annotation - Annovar command: {command_annovar}") 5382 run_parallel_commands([command_annovar], 1) 5383 5384 # Error messages 5385 log.info(f"Error/Warning messages:") 5386 error_message_command_all = [] 5387 error_message_command_warning = [] 5388 error_message_command_err = [] 5389 for err_file in err_files: 5390 with open(err_file, "r") as f: 5391 for line in f: 5392 message = line.strip() 5393 error_message_command_all.append(message) 5394 if line.startswith("[W::") or line.startswith("WARNING"): 5395 error_message_command_warning.append(message) 5396 if line.startswith("[E::") or line.startswith("ERROR"): 5397 
error_message_command_err.append( 5398 f"{err_file}: " + message 5399 ) 5400 # log info 5401 for message in list( 5402 set(error_message_command_err + error_message_command_warning) 5403 ): 5404 log.info(f" {message}") 5405 # debug info 5406 for message in list(set(error_message_command_all)): 5407 log.debug(f" {message}") 5408 # failed 5409 if len(error_message_command_err): 5410 log.error("Annotation failed: Error in commands") 5411 raise ValueError("Annotation failed: Error in commands") 5412 5413 if tmp_annotates_vcf_name_list: 5414 5415 # List of annotated files 5416 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5417 5418 # Tmp file 5419 tmp_annotate_vcf = NamedTemporaryFile( 5420 prefix=self.get_prefix(), 5421 dir=self.get_tmp_dir(), 5422 suffix=".vcf.gz", 5423 delete=False, 5424 ) 5425 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5426 tmp_files.append(tmp_annotate_vcf_name) 5427 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5428 err_files.append(tmp_annotate_vcf_name_err) 5429 tmp_files.append(tmp_annotate_vcf_name_err) 5430 5431 # Command merge 5432 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5433 log.info( 5434 f"Annotation Annovar - Annotation merging " 5435 + str(len(tmp_annotates_vcf_name_list)) 5436 + " annotated files" 5437 ) 5438 log.debug(f"Annotation - merge command: {merge_command}") 5439 run_parallel_commands([merge_command], 1) 5440 5441 # Find annotation in header 5442 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5443 header_list = self.read_vcf_header(f) 5444 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5445 5446 for ann in annovar_vcf_header.infos: 5447 if ann not in self.get_header().infos: 5448 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5449 5450 # Update variants 5451 log.info(f"Annotation Annovar - 
    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table from parquet (or attachable) annotation databases.

        For each database in param `annotation.parquet.annotations`, resolves the
        database files, adds the selected INFO fields to the in-memory VCF header,
        builds per-chromosome SQL UPDATE queries that concatenate the new annotations
        into the variants table's INFO column, and executes them on the DuckDB
        connection. "ALL"/"INFO" field selections expand to every field of the
        database header; the special annotation key "ALL" triggers a database scan.

        :param threads: number of threads; defaults to `self.get_threads()`
        :raises ValueError: if a database file or its header file cannot be found
        :return: None (the ":return: result" of the original docstring was inaccurate)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed/logged but not used in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: union of "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: overwrite existing INFO fields
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Append mode: only fill fields whose current value is empty or "."
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never appended to in this method; the cleanup loop at the
        # end is currently a no-op
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each one
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH if the database needs it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a generic String INFO definition for
                                # columns not described in the database header
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the fresh value can
                                # be re-appended by the annotation query)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                            )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate rows where the field is empty/"."
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the database's whole INFO column instead
                        # of per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE(review): aliases query_dict_remove (not a copy) —
                        # removal queries run first, then per-chromosome updates
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database (join on POS
                            # overlapping [START+1, END]; aggregated per POS)
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database (exact match on
                            # CHROM/POS/REF/ALT)
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO =
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                    AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.')
                                                THEN ';'
                                                ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (the generated concat() can nest very deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # "Count" is the row-count column of a DuckDB UPDATE result
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
concat({sql_query_annotation_update_info_sets_sql}) 5989 ) 5990 NOT IN ('','.') 5991 THEN ';' 5992 ELSE '' 5993 END 5994 , 5995 {sql_query_annotation_update_info_sets_sql} 5996 ) 5997 {sql_query_annotation_from_clause} 5998 WHERE {sql_query_annotation_where_clause} 5999 ; 6000 """ 6001 6002 # Add update query to dict 6003 query_dict[ 6004 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6005 ] = sql_query_annotation_chrom_interval_pos 6006 6007 nb_of_query = len(query_dict) 6008 num_query = 0 6009 6010 # SET max_expression_depth TO x 6011 self.conn.execute("SET max_expression_depth TO 10000") 6012 6013 for query_name in query_dict: 6014 query = query_dict[query_name] 6015 num_query += 1 6016 log.info( 6017 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6018 ) 6019 result = self.conn.execute(query) 6020 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6021 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6022 log.info( 6023 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6024 ) 6025 6026 log.info( 6027 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6028 ) 6029 6030 else: 6031 6032 log.info( 6033 f"Annotation '{annotation_name}' - No Annotations available" 6034 ) 6035 6036 log.debug("Final header: " + str(vcf_reader.infos)) 6037 6038 # Remove added columns 6039 for added_column in added_columns: 6040 self.drop_column(column=added_column) 6041 6042 def annotation_splice(self, threads: int = None) -> None: 6043 """ 6044 This function annotate with snpEff 6045 6046 :param threads: The number of threads to use 6047 :return: the value of the variable "return_value". 
6048 """ 6049 6050 # DEBUG 6051 log.debug("Start annotation with splice tools") 6052 6053 # Threads 6054 if not threads: 6055 threads = self.get_threads() 6056 log.debug("Threads: " + str(threads)) 6057 6058 # DEBUG 6059 delete_tmp = True 6060 if self.get_config().get("verbosity", "warning") in ["debug"]: 6061 delete_tmp = False 6062 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6063 6064 # Config 6065 config = self.get_config() 6066 log.debug("Config: " + str(config)) 6067 splice_config = config.get("tools", {}).get("splice", {}) 6068 if not splice_config: 6069 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6070 if not splice_config: 6071 msg_err = "No Splice tool config" 6072 log.error(msg_err) 6073 raise ValueError(msg_err) 6074 log.debug(f"splice_config={splice_config}") 6075 6076 # Config - Folders - Databases 6077 databases_folders = ( 6078 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6079 ) 6080 log.debug("Databases annotations: " + str(databases_folders)) 6081 6082 # Splice docker image 6083 splice_docker_image = splice_config.get("docker").get("image") 6084 6085 # Pull splice image if it's not already there 6086 if not check_docker_image_exists(splice_docker_image): 6087 log.warning( 6088 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6089 ) 6090 try: 6091 command(f"docker pull {splice_config.get('docker').get('image')}") 6092 except subprocess.CalledProcessError: 6093 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6094 log.error(msg_err) 6095 raise ValueError(msg_err) 6096 return None 6097 6098 # Config - splice databases 6099 splice_databases = ( 6100 config.get("folders", {}) 6101 .get("databases", {}) 6102 .get("splice", DEFAULT_SPLICE_FOLDER) 6103 ) 6104 splice_databases = full_path(splice_databases) 6105 6106 # Param 6107 param = self.get_param() 6108 log.debug("Param: " + str(param)) 6109 6110 # Param 6111 options = 
param.get("annotation", {}).get("splice", {}) 6112 log.debug("Options: " + str(options)) 6113 6114 # Data 6115 table_variants = self.get_table_variants() 6116 6117 # Check if not empty 6118 log.debug("Check if not empty") 6119 sql_query_chromosomes = ( 6120 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6121 ) 6122 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6123 log.info("VCF empty") 6124 return None 6125 6126 # Export in VCF 6127 log.debug("Create initial file to annotate") 6128 6129 # Create output folder 6130 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6131 if not os.path.exists(output_folder): 6132 Path(output_folder).mkdir(parents=True, exist_ok=True) 6133 6134 # Create tmp VCF file 6135 tmp_vcf = NamedTemporaryFile( 6136 prefix=self.get_prefix(), 6137 dir=output_folder, 6138 suffix=".vcf", 6139 delete=False, 6140 ) 6141 tmp_vcf_name = tmp_vcf.name 6142 6143 # VCF header 6144 header = self.get_header() 6145 6146 # Existing annotations 6147 for vcf_annotation in self.get_header().infos: 6148 6149 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6150 log.debug( 6151 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6152 ) 6153 6154 # Memory limit 6155 if config.get("memory", None): 6156 memory_limit = config.get("memory", "8G").upper() 6157 # upper() 6158 else: 6159 memory_limit = "8G" 6160 log.debug(f"memory_limit: {memory_limit}") 6161 6162 # Check number of variants to annotate 6163 where_clause_regex_spliceai = r"SpliceAI_\w+" 6164 where_clause_regex_spip = r"SPiP_\w+" 6165 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6166 df_list_of_variants_to_annotate = self.get_query_to_df( 6167 query=f""" SELECT * FROM variants {where_clause} """ 6168 ) 6169 if len(df_list_of_variants_to_annotate) == 0: 6170 log.warning( 6171 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6172 ) 6173 return None 6174 else: 6175 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6176 6177 # Export VCF file 6178 self.export_variant_vcf( 6179 vcf_file=tmp_vcf_name, 6180 remove_info=True, 6181 add_samples=True, 6182 index=False, 6183 where_clause=where_clause, 6184 ) 6185 6186 # Create docker container and launch splice analysis 6187 if splice_config: 6188 6189 # Splice mount folders 6190 mount_folders = splice_config.get("mount", {}) 6191 6192 # Genome mount 6193 mount_folders[ 6194 config.get("folders", {}) 6195 .get("databases", {}) 6196 .get("genomes", DEFAULT_GENOME_FOLDER) 6197 ] = "ro" 6198 6199 # SpliceAI mount 6200 mount_folders[ 6201 config.get("folders", {}) 6202 .get("databases", {}) 6203 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6204 ] = "ro" 6205 6206 # Genome mount 6207 mount_folders[ 6208 config.get("folders", {}) 6209 .get("databases", {}) 6210 .get("spip", DEFAULT_SPIP_FOLDER) 6211 ] = "ro" 6212 6213 # Mount folders 6214 mount = [] 6215 6216 # Config mount 6217 mount = [ 6218 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6219 for path, mode in mount_folders.items() 6220 ] 6221 6222 if any(value for value in splice_config.values() if value is None): 6223 log.warning("At least one splice config parameter is empty") 6224 return None 6225 6226 # Params in splice nf 6227 def check_values(dico: dict): 6228 """ 6229 Ensure parameters for NF splice pipeline 6230 """ 6231 for key, val in dico.items(): 6232 if key == "genome": 6233 if any( 6234 assemb in options.get("genome", {}) 6235 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6236 ): 6237 yield f"--{key} hg19" 6238 elif any( 6239 assemb in options.get("genome", {}) 6240 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6241 ): 6242 yield f"--{key} hg38" 6243 elif ( 6244 (isinstance(val, str) and val) 6245 or isinstance(val, int) 6246 or isinstance(val, bool) 6247 ): 6248 yield f"--{key} 
{val}" 6249 6250 # Genome 6251 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6252 options["genome"] = genome 6253 6254 # NF params 6255 nf_params = [] 6256 6257 # Add options 6258 if options: 6259 nf_params = list(check_values(options)) 6260 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6261 else: 6262 log.debug("No NF params provided") 6263 6264 # Add threads 6265 if "threads" not in options.keys(): 6266 nf_params.append(f"--threads {threads}") 6267 6268 # Genome path 6269 genome_path = find_genome( 6270 config.get("folders", {}) 6271 .get("databases", {}) 6272 .get("genomes", DEFAULT_GENOME_FOLDER), 6273 file=f"{genome}.fa", 6274 ) 6275 # Add genome path 6276 if not genome_path: 6277 raise ValueError( 6278 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6279 ) 6280 else: 6281 log.debug(f"Genome: {genome_path}") 6282 nf_params.append(f"--genome_path {genome_path}") 6283 6284 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6285 """ 6286 Setting up updated databases for SPiP and SpliceAI 6287 """ 6288 6289 try: 6290 6291 # SpliceAI assembly transcriptome 6292 spliceai_assembly = os.path.join( 6293 config.get("folders", {}) 6294 .get("databases", {}) 6295 .get("spliceai", {}), 6296 options.get("genome"), 6297 "transcriptome", 6298 ) 6299 spip_assembly = options.get("genome") 6300 6301 spip = find( 6302 f"transcriptome_{spip_assembly}.RData", 6303 config.get("folders", {}).get("databases", {}).get("spip", {}), 6304 ) 6305 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6306 log.debug(f"SPiP annotations: {spip}") 6307 log.debug(f"SpliceAI annotations: {spliceai}") 6308 if spip and spliceai: 6309 return [ 6310 f"--spip_transcriptome {spip}", 6311 f"--spliceai_annotations {spliceai}", 6312 ] 6313 else: 6314 # TODO crash and go on with basic annotations ? 
6315 # raise ValueError( 6316 # "Can't find splice databases in configuration EXIT" 6317 # ) 6318 log.warning( 6319 "Can't find splice databases in configuration, use annotations file from image" 6320 ) 6321 except TypeError: 6322 log.warning( 6323 "Can't find splice databases in configuration, use annotations file from image" 6324 ) 6325 return [] 6326 6327 # Add options, check if transcriptome option have already beend provided 6328 if ( 6329 "spip_transcriptome" not in nf_params 6330 and "spliceai_transcriptome" not in nf_params 6331 ): 6332 splice_reference = splice_annotations(options, config) 6333 if splice_reference: 6334 nf_params.extend(splice_reference) 6335 6336 nf_params.append(f"--output_folder {output_folder}") 6337 6338 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6339 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6340 log.debug(cmd) 6341 6342 splice_config["docker"]["command"] = cmd 6343 6344 docker_cmd = get_bin_command( 6345 tool="splice", 6346 bin_type="docker", 6347 config=config, 6348 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6349 add_options=f"--name {random_uuid} {' '.join(mount)}", 6350 ) 6351 6352 # Docker debug 6353 # if splice_config.get("rm_container"): 6354 # rm_container = "--rm" 6355 # else: 6356 # rm_container = "" 6357 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6358 6359 log.debug(docker_cmd) 6360 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6361 log.debug(res.stdout) 6362 if res.stderr: 6363 log.error(res.stderr) 6364 res.check_returncode() 6365 else: 6366 log.warning(f"Splice tool configuration not found: {config}") 6367 
6368 # Update variants 6369 log.info("Annotation - Updating...") 6370 # Test find output vcf 6371 log.debug( 6372 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6373 ) 6374 output_vcf = [] 6375 # Wrong folder to look in 6376 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6377 if ( 6378 files 6379 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6380 ): 6381 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6382 # log.debug(os.listdir(options.get("output_folder"))) 6383 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6384 if not output_vcf: 6385 log.debug( 6386 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6387 ) 6388 else: 6389 # Get new header from annotated vcf 6390 log.debug(f"Initial header: {len(header.infos)} fields") 6391 # Create new header with splice infos 6392 new_vcf = Variants(input=output_vcf[0]) 6393 new_vcf_header = new_vcf.get_header().infos 6394 for keys, infos in new_vcf_header.items(): 6395 if keys not in header.infos.keys(): 6396 header.infos[keys] = infos 6397 log.debug(f"New header: {len(header.infos)} fields") 6398 log.debug(f"Splice tmp output: {output_vcf[0]}") 6399 self.update_from_vcf(output_vcf[0]) 6400 6401 # Remove folder 6402 remove_if_exists(output_folder) 6403 6404 ### 6405 # Prioritization 6406 ### 6407 6408 def get_config_default(self, name: str) -> dict: 6409 """ 6410 The function `get_config_default` returns a dictionary containing default configurations for 6411 various calculations and prioritizations. 6412 6413 :param name: The `get_config_default` function returns a dictionary containing default 6414 configurations for different calculations and prioritizations. 
        The `name` parameter is used to
        specify which specific configuration to retrieve from the dictionary
        :type name: str
        :return: The function `get_config_default` returns a dictionary containing default configuration
        settings for different calculations and prioritizations. The specific configuration settings are
        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
        matches a key in the `config_default` dictionary, the corresponding configuration settings are
        returned. If there is no match, None is returned.
        """

        # Default configurations, keyed by section name
        # ("calculations" or "prioritizations")
        config_default = {
            "calculations": {
                # SQL-based calculation: build a variant identifier from the
                # position columns
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                # SQL-based calculation: classify the variant type from
                # REF/ALT lengths; SVTYPE takes precedence when present
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                    CASE
                        WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                        WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                        WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                        WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                        WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                        ELSE 'UNDEFINED'
                    END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                # Python-based calculations, dispatched through function_name
                # and function_params
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of the next two entries look
                # swapped relative to their uniquify flag (False here says
                # "with uniquify values", True below does not) — confirm
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Default prioritization profile: score/flag variants from the
            # impact value carried in the ANN2 field
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # None when the requested section name is unknown
        return config_default.get(name, None)

    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
        default values, a dictionary, and a file.

        :param name: The `name` parameter in the `get_config_json` function is a string that represents
        the name of the configuration. It is used to identify and retrieve the configuration settings
        for a specific component or module
        :type name: str
        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
        dictionary that allows you to provide additional configuration settings or overrides.
When you 6670 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6671 the key is the configuration setting you want to override or 6672 :type config_dict: dict 6673 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6674 specify the path to a configuration file that contains additional settings. If provided, the 6675 function will read the contents of this file and update the configuration dictionary with the 6676 values found in the file, overriding any existing values with the 6677 :type config_file: str 6678 :return: The function `get_config_json` returns a dictionary containing the configuration 6679 settings. 6680 """ 6681 6682 # Create with default prioritizations 6683 config_default = self.get_config_default(name=name) 6684 configuration = config_default 6685 # log.debug(f"configuration={configuration}") 6686 6687 # Replace prioritizations from dict 6688 for config in config_dict: 6689 configuration[config] = config_dict[config] 6690 6691 # Replace prioritizations from file 6692 config_file = full_path(config_file) 6693 if config_file: 6694 if os.path.exists(config_file): 6695 with open(config_file) as config_file_content: 6696 config_file_dict = json.load(config_file_content) 6697 for config in config_file_dict: 6698 configuration[config] = config_file_dict[config] 6699 else: 6700 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6701 log.error(msg_error) 6702 raise ValueError(msg_error) 6703 6704 return configuration 6705 6706 def prioritization( 6707 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6708 ) -> bool: 6709 """ 6710 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6711 prioritizes variants based on configured profiles and criteria. 
6712 6713 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6714 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6715 a table name is provided, the method will prioritize the variants in that specific table 6716 :type table: str 6717 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6718 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6719 provided, the code will use a default prefix value of "PZ" 6720 :type pz_prefix: str 6721 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6722 additional parameters specific to the prioritization process. These parameters can include 6723 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6724 configurations needed for the prioritization of variants in a V 6725 :type pz_param: dict 6726 :return: A boolean value (True) is being returned from the `prioritization` function. 
6727 """ 6728 6729 # Config 6730 config = self.get_config() 6731 6732 # Param 6733 param = self.get_param() 6734 6735 # Prioritization param 6736 if pz_param is not None: 6737 prioritization_param = pz_param 6738 else: 6739 prioritization_param = param.get("prioritization", {}) 6740 6741 # Configuration profiles 6742 prioritization_config_file = prioritization_param.get( 6743 "prioritization_config", None 6744 ) 6745 prioritization_config_file = full_path(prioritization_config_file) 6746 prioritizations_config = self.get_config_json( 6747 name="prioritizations", config_file=prioritization_config_file 6748 ) 6749 6750 # Prioritization prefix 6751 pz_prefix_default = "PZ" 6752 if pz_prefix is None: 6753 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6754 6755 # Prioritization options 6756 profiles = prioritization_param.get("profiles", []) 6757 if isinstance(profiles, str): 6758 profiles = profiles.split(",") 6759 pzfields = prioritization_param.get( 6760 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6761 ) 6762 if isinstance(pzfields, str): 6763 pzfields = pzfields.split(",") 6764 default_profile = prioritization_param.get("default_profile", None) 6765 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6766 prioritization_score_mode = prioritization_param.get( 6767 "prioritization_score_mode", "HOWARD" 6768 ) 6769 6770 # Quick Prioritizations 6771 prioritizations = param.get("prioritizations", None) 6772 if prioritizations: 6773 log.info("Quick Prioritization:") 6774 for profile in prioritizations.split(","): 6775 if profile not in profiles: 6776 profiles.append(profile) 6777 log.info(f" {profile}") 6778 6779 # If profile "ALL" provided, all profiles in the config profiles 6780 if "ALL" in profiles: 6781 profiles = list(prioritizations_config.keys()) 6782 6783 for profile in profiles: 6784 if prioritizations_config.get(profile, None): 6785 log.debug(f"Profile '{profile}' configured") 6786 else: 6787 msg_error = f"Profile 
'{profile}' NOT configured" 6788 log.error(msg_error) 6789 raise ValueError(msg_error) 6790 6791 if profiles: 6792 log.info(f"Prioritization... ") 6793 else: 6794 log.debug(f"No profile defined") 6795 return False 6796 6797 if not default_profile and len(profiles): 6798 default_profile = profiles[0] 6799 6800 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6801 log.debug("Profiles to check: " + str(list(profiles))) 6802 6803 # Variables 6804 if table is not None: 6805 table_variants = table 6806 else: 6807 table_variants = self.get_table_variants(clause="update") 6808 log.debug(f"Table to prioritize: {table_variants}") 6809 6810 # Added columns 6811 added_columns = [] 6812 6813 # Create list of PZfields 6814 # List of PZFields 6815 list_of_pzfields_original = pzfields + [ 6816 pzfield + pzfields_sep + profile 6817 for pzfield in pzfields 6818 for profile in profiles 6819 ] 6820 list_of_pzfields = [] 6821 log.debug(f"{list_of_pzfields_original}") 6822 6823 # Remove existing PZfields to use if exists 6824 for pzfield in list_of_pzfields_original: 6825 if self.get_header().infos.get(pzfield, None) is None: 6826 list_of_pzfields.append(pzfield) 6827 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6828 else: 6829 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6830 6831 if list_of_pzfields: 6832 6833 # Explode Infos prefix 6834 explode_infos_prefix = self.get_explode_infos_prefix() 6835 6836 # PZfields tags description 6837 PZfields_INFOS = { 6838 f"{pz_prefix}Tags": { 6839 "ID": f"{pz_prefix}Tags", 6840 "Number": ".", 6841 "Type": "String", 6842 "Description": "Variant tags based on annotation criteria", 6843 }, 6844 f"{pz_prefix}Score": { 6845 "ID": f"{pz_prefix}Score", 6846 "Number": 1, 6847 "Type": "Integer", 6848 "Description": "Variant score based on annotation criteria", 6849 }, 6850 f"{pz_prefix}Flag": { 6851 "ID": f"{pz_prefix}Flag", 6852 "Number": 1, 6853 "Type": "String", 6854 
"Description": "Variant flag based on annotation criteria", 6855 }, 6856 f"{pz_prefix}Comment": { 6857 "ID": f"{pz_prefix}Comment", 6858 "Number": ".", 6859 "Type": "String", 6860 "Description": "Variant comment based on annotation criteria", 6861 }, 6862 f"{pz_prefix}Infos": { 6863 "ID": f"{pz_prefix}Infos", 6864 "Number": ".", 6865 "Type": "String", 6866 "Description": "Variant infos based on annotation criteria", 6867 }, 6868 f"{pz_prefix}Class": { 6869 "ID": f"{pz_prefix}Class", 6870 "Number": ".", 6871 "Type": "String", 6872 "Description": "Variant class based on annotation criteria", 6873 }, 6874 } 6875 6876 # Create INFO fields if not exist 6877 for field in PZfields_INFOS: 6878 field_ID = PZfields_INFOS[field]["ID"] 6879 field_description = PZfields_INFOS[field]["Description"] 6880 if field_ID not in self.get_header().infos and field_ID in pzfields: 6881 field_description = ( 6882 PZfields_INFOS[field]["Description"] 6883 + f", profile {default_profile}" 6884 ) 6885 self.get_header().infos[field_ID] = vcf.parser._Info( 6886 field_ID, 6887 PZfields_INFOS[field]["Number"], 6888 PZfields_INFOS[field]["Type"], 6889 field_description, 6890 "unknown", 6891 "unknown", 6892 code_type_map[PZfields_INFOS[field]["Type"]], 6893 ) 6894 6895 # Create INFO fields if not exist for each profile 6896 for profile in prioritizations_config: 6897 if profile in profiles or profiles == []: 6898 for field in PZfields_INFOS: 6899 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6900 field_description = ( 6901 PZfields_INFOS[field]["Description"] 6902 + f", profile {profile}" 6903 ) 6904 if ( 6905 field_ID not in self.get_header().infos 6906 and field in pzfields 6907 ): 6908 self.get_header().infos[field_ID] = vcf.parser._Info( 6909 field_ID, 6910 PZfields_INFOS[field]["Number"], 6911 PZfields_INFOS[field]["Type"], 6912 field_description, 6913 "unknown", 6914 "unknown", 6915 code_type_map[PZfields_INFOS[field]["Type"]], 6916 ) 6917 6918 # Header 6919 for pzfield in 
list_of_pzfields: 6920 if re.match(f"{pz_prefix}Score.*", pzfield): 6921 added_column = self.add_column( 6922 table_name=table_variants, 6923 column_name=pzfield, 6924 column_type="INTEGER", 6925 default_value="0", 6926 ) 6927 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6928 added_column = self.add_column( 6929 table_name=table_variants, 6930 column_name=pzfield, 6931 column_type="BOOLEAN", 6932 default_value="1", 6933 ) 6934 elif re.match(f"{pz_prefix}Class.*", pzfield): 6935 added_column = self.add_column( 6936 table_name=table_variants, 6937 column_name=pzfield, 6938 column_type="VARCHAR[]", 6939 default_value="null", 6940 ) 6941 else: 6942 added_column = self.add_column( 6943 table_name=table_variants, 6944 column_name=pzfield, 6945 column_type="STRING", 6946 default_value="''", 6947 ) 6948 added_columns.append(added_column) 6949 6950 # Profiles 6951 if profiles: 6952 6953 # foreach profile in configuration file 6954 for profile in prioritizations_config: 6955 6956 # If profile is asked in param, or ALL are asked (empty profile []) 6957 if profile in profiles or profiles == []: 6958 log.info(f"Profile '{profile}'") 6959 6960 sql_set_info_option = "" 6961 6962 sql_set_info = [] 6963 6964 # PZ fields set 6965 6966 # PZScore 6967 if ( 6968 f"{pz_prefix}Score{pzfields_sep}{profile}" 6969 in list_of_pzfields 6970 ): 6971 sql_set_info.append( 6972 f""" 6973 concat( 6974 '{pz_prefix}Score{pzfields_sep}{profile}=', 6975 {pz_prefix}Score{pzfields_sep}{profile} 6976 ) 6977 """ 6978 ) 6979 if ( 6980 profile == default_profile 6981 and f"{pz_prefix}Score" in list_of_pzfields 6982 ): 6983 sql_set_info.append( 6984 f""" 6985 concat( 6986 '{pz_prefix}Score=', 6987 {pz_prefix}Score{pzfields_sep}{profile} 6988 ) 6989 """ 6990 ) 6991 6992 # PZFlag 6993 if ( 6994 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6995 in list_of_pzfields 6996 ): 6997 sql_set_info.append( 6998 f""" 6999 concat( 7000 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7001 CASE 7002 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7003 THEN 'PASS' 7004 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7005 THEN 'FILTERED' 7006 END 7007 ) 7008 """ 7009 ) 7010 if ( 7011 profile == default_profile 7012 and f"{pz_prefix}Flag" in list_of_pzfields 7013 ): 7014 sql_set_info.append( 7015 f""" 7016 concat( 7017 '{pz_prefix}Flag=', 7018 CASE 7019 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7020 THEN 'PASS' 7021 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7022 THEN 'FILTERED' 7023 END 7024 ) 7025 """ 7026 ) 7027 7028 # PZClass 7029 if ( 7030 f"{pz_prefix}Class{pzfields_sep}{profile}" 7031 in list_of_pzfields 7032 ): 7033 sql_set_info.append( 7034 f""" 7035 concat( 7036 '{pz_prefix}Class{pzfields_sep}{profile}=', 7037 CASE 7038 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7039 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7040 ELSE '.' 7041 END 7042 ) 7043 7044 """ 7045 ) 7046 if ( 7047 profile == default_profile 7048 and f"{pz_prefix}Class" in list_of_pzfields 7049 ): 7050 sql_set_info.append( 7051 f""" 7052 concat( 7053 '{pz_prefix}Class=', 7054 CASE 7055 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7056 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7057 ELSE '.' 
7058 END 7059 ) 7060 """ 7061 ) 7062 7063 # PZComment 7064 if ( 7065 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7066 in list_of_pzfields 7067 ): 7068 sql_set_info.append( 7069 f""" 7070 CASE 7071 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7072 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7073 ELSE '' 7074 END 7075 """ 7076 ) 7077 if ( 7078 profile == default_profile 7079 and f"{pz_prefix}Comment" in list_of_pzfields 7080 ): 7081 sql_set_info.append( 7082 f""" 7083 CASE 7084 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7085 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7086 ELSE '' 7087 END 7088 """ 7089 ) 7090 7091 # PZInfos 7092 if ( 7093 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7094 in list_of_pzfields 7095 ): 7096 sql_set_info.append( 7097 f""" 7098 CASE 7099 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7100 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7101 ELSE '' 7102 END 7103 """ 7104 ) 7105 if ( 7106 profile == default_profile 7107 and f"{pz_prefix}Infos" in list_of_pzfields 7108 ): 7109 sql_set_info.append( 7110 f""" 7111 CASE 7112 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7113 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7114 ELSE '' 7115 END 7116 """ 7117 ) 7118 7119 # Merge PZfields 7120 sql_set_info_option = "" 7121 sql_set_sep = "" 7122 for sql_set in sql_set_info: 7123 if sql_set_sep: 7124 sql_set_info_option += f""" 7125 , concat('{sql_set_sep}', {sql_set}) 7126 """ 7127 else: 7128 sql_set_info_option += f""" 7129 , {sql_set} 7130 """ 7131 sql_set_sep = ";" 7132 7133 sql_queries = [] 7134 for annotation in prioritizations_config[profile]: 7135 7136 # skip special sections 7137 if annotation.startswith("_"): 7138 continue 7139 7140 # For each criterions 7141 for criterion in prioritizations_config[profile][ 7142 annotation 
7143 ]: 7144 7145 # Criterion mode 7146 criterion_mode = None 7147 if np.any( 7148 np.isin(list(criterion.keys()), ["type", "value"]) 7149 ): 7150 criterion_mode = "operation" 7151 elif np.any( 7152 np.isin(list(criterion.keys()), ["sql", "fields"]) 7153 ): 7154 criterion_mode = "sql" 7155 log.debug(f"Criterion Mode: {criterion_mode}") 7156 7157 # Criterion parameters 7158 criterion_type = criterion.get("type", None) 7159 criterion_value = criterion.get("value", None) 7160 criterion_sql = criterion.get("sql", None) 7161 criterion_fields = criterion.get("fields", None) 7162 criterion_score = criterion.get("score", 0) 7163 criterion_flag = criterion.get("flag", "PASS") 7164 criterion_class = criterion.get("class", None) 7165 criterion_flag_bool = criterion_flag == "PASS" 7166 criterion_comment = ( 7167 ", ".join(criterion.get("comment", [])) 7168 .replace("'", "''") 7169 .replace(";", ",") 7170 .replace("\t", " ") 7171 ) 7172 criterion_infos = ( 7173 str(criterion) 7174 .replace("'", "''") 7175 .replace(";", ",") 7176 .replace("\t", " ") 7177 ) 7178 7179 # SQL 7180 if criterion_sql is not None and isinstance( 7181 criterion_sql, list 7182 ): 7183 criterion_sql = " ".join(criterion_sql) 7184 7185 # Fields and explode 7186 if criterion_fields is None: 7187 criterion_fields = [annotation] 7188 if not isinstance(criterion_fields, list): 7189 criterion_fields = str(criterion_fields).split(",") 7190 7191 # Class 7192 if criterion_class is not None and not isinstance( 7193 criterion_class, list 7194 ): 7195 criterion_class = str(criterion_class).split(",") 7196 7197 for annotation_field in criterion_fields: 7198 7199 # Explode specific annotation 7200 log.debug( 7201 f"Explode annotation '{annotation_field}'" 7202 ) 7203 added_columns += self.explode_infos( 7204 prefix=explode_infos_prefix, 7205 fields=[annotation_field], 7206 table=table_variants, 7207 ) 7208 extra_infos = self.get_extra_infos( 7209 table=table_variants 7210 ) 7211 7212 # Check if annotation field is 
present 7213 if ( 7214 f"{explode_infos_prefix}{annotation_field}" 7215 not in extra_infos 7216 ): 7217 msq_err = f"Annotation '{annotation_field}' not in data" 7218 log.error(msq_err) 7219 raise ValueError(msq_err) 7220 else: 7221 log.debug( 7222 f"Annotation '{annotation_field}' in data" 7223 ) 7224 7225 sql_set = [] 7226 sql_set_info = [] 7227 7228 # PZ fields set 7229 7230 # PZScore 7231 if ( 7232 f"{pz_prefix}Score{pzfields_sep}{profile}" 7233 in list_of_pzfields 7234 ): 7235 # if prioritization_score_mode == "HOWARD": 7236 # sql_set.append( 7237 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7238 # ) 7239 # VaRank prioritization score mode 7240 if prioritization_score_mode == "VaRank": 7241 sql_set.append( 7242 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7243 ) 7244 # default HOWARD prioritization score mode 7245 else: 7246 sql_set.append( 7247 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7248 ) 7249 7250 # PZFlag 7251 if ( 7252 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7253 in list_of_pzfields 7254 ): 7255 sql_set.append( 7256 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7257 ) 7258 7259 # PZClass 7260 if ( 7261 f"{pz_prefix}Class{pzfields_sep}{profile}" 7262 in list_of_pzfields 7263 and criterion_class is not None 7264 ): 7265 sql_set.append( 7266 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7267 ) 7268 7269 # PZComment 7270 if ( 7271 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7272 in list_of_pzfields 7273 ): 7274 sql_set.append( 7275 f""" 7276 {pz_prefix}Comment{pzfields_sep}{profile} = 7277 concat( 7278 {pz_prefix}Comment{pzfields_sep}{profile}, 7279 CASE 7280 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7281 THEN ', ' 7282 ELSE '' 7283 END, 7284 '{criterion_comment}' 7285 ) 7286 """ 7287 ) 7288 7289 # PZInfos 7290 if ( 7291 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7292 in list_of_pzfields 7293 ): 7294 sql_set.append( 7295 f""" 7296 {pz_prefix}Infos{pzfields_sep}{profile} = 7297 concat( 7298 {pz_prefix}Infos{pzfields_sep}{profile}, 7299 '{criterion_infos}' 7300 ) 7301 """ 7302 ) 7303 sql_set_option = ",".join(sql_set) 7304 7305 # Criterion and comparison 7306 if sql_set_option: 7307 7308 if criterion_mode in ["operation"]: 7309 7310 try: 7311 float(criterion_value) 7312 sql_update = f""" 7313 UPDATE {table_variants} 7314 SET {sql_set_option} 7315 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7316 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7317 """ 7318 except: 7319 contains_option = "" 7320 if criterion_type == "contains": 7321 contains_option = ".*" 7322 sql_update = f""" 7323 UPDATE {table_variants} 7324 SET {sql_set_option} 7325 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7326 """ 7327 sql_queries.append(sql_update) 7328 7329 elif criterion_mode in ["sql"]: 7330 7331 sql_update = f""" 7332 UPDATE {table_variants} 7333 SET {sql_set_option} 7334 WHERE {criterion_sql} 7335 """ 7336 sql_queries.append(sql_update) 7337 7338 else: 7339 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7340 log.error(msg_err) 7341 raise ValueError(msg_err) 7342 7343 else: 7344 log.warning( 7345 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7346 ) 7347 7348 # PZTags 7349 if ( 7350 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7351 in list_of_pzfields 7352 ): 7353 7354 # Create PZFalgs value 7355 pztags_value = "" 7356 pztags_sep_default = "," 7357 pztags_sep = "" 7358 for pzfield in pzfields: 7359 if pzfield not in [f"{pz_prefix}Tags"]: 7360 if ( 7361 
f"{pzfield}{pzfields_sep}{profile}" 7362 in list_of_pzfields 7363 ): 7364 if pzfield in [f"{pz_prefix}Flag"]: 7365 pztags_value += f"""{pztags_sep}{pzfield}#', 7366 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7367 THEN 'PASS' 7368 ELSE 'FILTERED' 7369 END, '""" 7370 elif pzfield in [f"{pz_prefix}Class"]: 7371 pztags_value += f"""{pztags_sep}{pzfield}#', 7372 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7373 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7374 ELSE '.' 7375 END, '""" 7376 else: 7377 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7378 pztags_sep = pztags_sep_default 7379 7380 # Add Query update for PZFlags 7381 sql_update_pztags = f""" 7382 UPDATE {table_variants} 7383 SET INFO = concat( 7384 INFO, 7385 CASE WHEN INFO NOT in ('','.') 7386 THEN ';' 7387 ELSE '' 7388 END, 7389 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7390 ) 7391 """ 7392 sql_queries.append(sql_update_pztags) 7393 7394 # Add Query update for PZFlags for default 7395 if profile == default_profile: 7396 sql_update_pztags_default = f""" 7397 UPDATE {table_variants} 7398 SET INFO = concat( 7399 INFO, 7400 ';', 7401 '{pz_prefix}Tags={pztags_value}' 7402 ) 7403 """ 7404 sql_queries.append(sql_update_pztags_default) 7405 7406 log.info(f"""Profile '{profile}' - Prioritization... """) 7407 7408 if sql_queries: 7409 7410 for sql_query in sql_queries: 7411 log.debug( 7412 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7413 ) 7414 self.conn.execute(sql_query) 7415 7416 log.info(f"""Profile '{profile}' - Update... 
""") 7417 sql_query_update = f""" 7418 UPDATE {table_variants} 7419 SET INFO = 7420 concat( 7421 CASE 7422 WHEN INFO NOT IN ('','.') 7423 THEN concat(INFO, ';') 7424 ELSE '' 7425 END 7426 {sql_set_info_option} 7427 ) 7428 """ 7429 self.conn.execute(sql_query_update) 7430 7431 else: 7432 7433 log.warning(f"No profiles in parameters") 7434 7435 # Remove added columns 7436 for added_column in added_columns: 7437 self.drop_column(column=added_column) 7438 7439 # Explode INFOS fields into table fields 7440 if self.get_explode_infos(): 7441 self.explode_infos( 7442 prefix=self.get_explode_infos_prefix(), 7443 fields=self.get_explode_infos_fields(), 7444 force=True, 7445 ) 7446 7447 return True 7448 7449 ### 7450 # HGVS 7451 ### 7452 7453 def annotation_hgvs(self, threads: int = None) -> None: 7454 """ 7455 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7456 coordinates and alleles. 7457 7458 :param threads: The `threads` parameter is an optional integer that specifies the number of 7459 threads to use for parallel processing. If no value is provided, it will default to the number 7460 of threads obtained from the `get_threads()` method 7461 :type threads: int 7462 """ 7463 7464 # Function for each partition of the Dask Dataframe 7465 def partition_function(partition): 7466 """ 7467 The function `partition_function` applies the `annotation_hgvs_partition` function to 7468 each row of a DataFrame called `partition`. 7469 7470 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7471 to be processed 7472 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7473 the "partition" dataframe along the axis 1. 
7474 """ 7475 return partition.apply(annotation_hgvs_partition, axis=1) 7476 7477 def annotation_hgvs_partition(row) -> str: 7478 """ 7479 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7480 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7481 7482 :param row: A dictionary-like object that contains the values for the following keys: 7483 :return: a string that contains the HGVS names associated with the given row of data. 7484 """ 7485 7486 chr = row["CHROM"] 7487 pos = row["POS"] 7488 ref = row["REF"] 7489 alt = row["ALT"] 7490 7491 # Find list of associated transcripts 7492 transcripts_list = list( 7493 polars_conn.execute( 7494 f""" 7495 SELECT transcript 7496 FROM refseq_df 7497 WHERE CHROM='{chr}' 7498 AND POS={pos} 7499 """ 7500 )["transcript"] 7501 ) 7502 7503 # Full HGVS annotation in list 7504 hgvs_full_list = [] 7505 7506 for transcript_name in transcripts_list: 7507 7508 # Transcript 7509 transcript = get_transcript( 7510 transcripts=transcripts, transcript_name=transcript_name 7511 ) 7512 # Exon 7513 if use_exon: 7514 exon = transcript.find_exon_number(pos) 7515 else: 7516 exon = None 7517 # Protein 7518 transcript_protein = None 7519 if use_protein or add_protein or full_format: 7520 transcripts_protein = list( 7521 polars_conn.execute( 7522 f""" 7523 SELECT protein 7524 FROM refseqlink_df 7525 WHERE transcript='{transcript_name}' 7526 LIMIT 1 7527 """ 7528 )["protein"] 7529 ) 7530 if len(transcripts_protein): 7531 transcript_protein = transcripts_protein[0] 7532 7533 # HGVS name 7534 hgvs_name = format_hgvs_name( 7535 chr, 7536 pos, 7537 ref, 7538 alt, 7539 genome=genome, 7540 transcript=transcript, 7541 transcript_protein=transcript_protein, 7542 exon=exon, 7543 use_gene=use_gene, 7544 use_protein=use_protein, 7545 full_format=full_format, 7546 use_version=use_version, 7547 codon_type=codon_type, 7548 ) 7549 hgvs_full_list.append(hgvs_name) 7550 if add_protein and not 
use_protein and not full_format: 7551 hgvs_name = format_hgvs_name( 7552 chr, 7553 pos, 7554 ref, 7555 alt, 7556 genome=genome, 7557 transcript=transcript, 7558 transcript_protein=transcript_protein, 7559 exon=exon, 7560 use_gene=use_gene, 7561 use_protein=True, 7562 full_format=False, 7563 use_version=use_version, 7564 codon_type=codon_type, 7565 ) 7566 hgvs_full_list.append(hgvs_name) 7567 7568 # Create liste of HGVS annotations 7569 hgvs_full = ",".join(hgvs_full_list) 7570 7571 return hgvs_full 7572 7573 # Polars connexion 7574 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7575 7576 # Config 7577 config = self.get_config() 7578 7579 # Databases 7580 # Genome 7581 databases_genomes_folders = ( 7582 config.get("folders", {}) 7583 .get("databases", {}) 7584 .get("genomes", DEFAULT_GENOME_FOLDER) 7585 ) 7586 databases_genome = ( 7587 config.get("folders", {}).get("databases", {}).get("genomes", "") 7588 ) 7589 # refseq database folder 7590 databases_refseq_folders = ( 7591 config.get("folders", {}) 7592 .get("databases", {}) 7593 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7594 ) 7595 # refseq 7596 databases_refseq = config.get("databases", {}).get("refSeq", None) 7597 # refSeqLink 7598 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7599 7600 # Param 7601 param = self.get_param() 7602 7603 # Quick HGVS 7604 if "hgvs_options" in param and param.get("hgvs_options", ""): 7605 log.info(f"Quick HGVS Annotation:") 7606 if not param.get("hgvs", None): 7607 param["hgvs"] = {} 7608 for option in param.get("hgvs_options", "").split(","): 7609 option_var_val = option.split("=") 7610 option_var = option_var_val[0] 7611 if len(option_var_val) > 1: 7612 option_val = option_var_val[1] 7613 else: 7614 option_val = "True" 7615 if option_val.upper() in ["TRUE"]: 7616 option_val = True 7617 elif option_val.upper() in ["FALSE"]: 7618 option_val = False 7619 log.info(f" {option_var}={option_val}") 7620 param["hgvs"][option_var] = option_val 7621 
7622 # Check if HGVS annotation enabled 7623 if "hgvs" in param: 7624 log.info(f"HGVS Annotation... ") 7625 for hgvs_option in param.get("hgvs", {}): 7626 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7627 else: 7628 return 7629 7630 # HGVS Param 7631 param_hgvs = param.get("hgvs", {}) 7632 use_exon = param_hgvs.get("use_exon", False) 7633 use_gene = param_hgvs.get("use_gene", False) 7634 use_protein = param_hgvs.get("use_protein", False) 7635 add_protein = param_hgvs.get("add_protein", False) 7636 full_format = param_hgvs.get("full_format", False) 7637 use_version = param_hgvs.get("use_version", False) 7638 codon_type = param_hgvs.get("codon_type", "3") 7639 7640 # refSseq refSeqLink 7641 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7642 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7643 7644 # Assembly 7645 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7646 7647 # Genome 7648 genome_file = None 7649 if find_genome(databases_genome): 7650 genome_file = find_genome(databases_genome) 7651 else: 7652 genome_file = find_genome( 7653 genome_path=databases_genomes_folders, assembly=assembly 7654 ) 7655 log.debug("Genome: " + str(genome_file)) 7656 7657 # refSseq 7658 refseq_file = find_file_prefix( 7659 input_file=databases_refseq, 7660 prefix="ncbiRefSeq", 7661 folder=databases_refseq_folders, 7662 assembly=assembly, 7663 ) 7664 log.debug("refSeq: " + str(refseq_file)) 7665 7666 # refSeqLink 7667 refseqlink_file = find_file_prefix( 7668 input_file=databases_refseqlink, 7669 prefix="ncbiRefSeqLink", 7670 folder=databases_refseq_folders, 7671 assembly=assembly, 7672 ) 7673 log.debug("refSeqLink: " + str(refseqlink_file)) 7674 7675 # Threads 7676 if not threads: 7677 threads = self.get_threads() 7678 log.debug("Threads: " + str(threads)) 7679 7680 # Variables 7681 table_variants = self.get_table_variants(clause="update") 7682 7683 # Get variants SNV and InDel only 7684 
query_variants = f""" 7685 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7686 FROM {table_variants} 7687 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7688 """ 7689 df_variants = self.get_query_to_df(query_variants) 7690 7691 # Added columns 7692 added_columns = [] 7693 7694 # Add hgvs column in variants table 7695 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7696 added_column = self.add_column( 7697 table_variants, hgvs_column_name, "STRING", default_value=None 7698 ) 7699 added_columns.append(added_column) 7700 7701 log.debug(f"refSeq loading...") 7702 # refSeq in duckDB 7703 refseq_table = get_refseq_table( 7704 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7705 ) 7706 # Loading all refSeq in Dataframe 7707 refseq_query = f""" 7708 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7709 FROM {refseq_table} 7710 JOIN df_variants ON ( 7711 {refseq_table}.chrom = df_variants.CHROM 7712 AND {refseq_table}.txStart<=df_variants.POS 7713 AND {refseq_table}.txEnd>=df_variants.POS 7714 ) 7715 """ 7716 refseq_df = self.conn.query(refseq_query).pl() 7717 7718 if refseqlink_file: 7719 log.debug(f"refSeqLink loading...") 7720 # refSeqLink in duckDB 7721 refseqlink_table = get_refseq_table( 7722 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7723 ) 7724 # Loading all refSeqLink in Dataframe 7725 protacc_column = "protAcc_with_ver" 7726 mrnaacc_column = "mrnaAcc_with_ver" 7727 refseqlink_query = f""" 7728 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7729 FROM {refseqlink_table} 7730 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7731 WHERE protAcc_without_ver IS NOT NULL 7732 """ 7733 # Polars Dataframe 7734 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7735 7736 # Read RefSeq transcripts into a python dict/model. 
7737 log.debug(f"Transcripts loading...") 7738 with tempfile.TemporaryDirectory() as tmpdir: 7739 transcripts_query = f""" 7740 COPY ( 7741 SELECT {refseq_table}.* 7742 FROM {refseq_table} 7743 JOIN df_variants ON ( 7744 {refseq_table}.chrom=df_variants.CHROM 7745 AND {refseq_table}.txStart<=df_variants.POS 7746 AND {refseq_table}.txEnd>=df_variants.POS 7747 ) 7748 ) 7749 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7750 """ 7751 self.conn.query(transcripts_query) 7752 with open(f"{tmpdir}/transcript.tsv") as infile: 7753 transcripts = read_transcripts(infile) 7754 7755 # Polars connexion 7756 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7757 7758 log.debug("Genome loading...") 7759 # Read genome sequence using pyfaidx. 7760 genome = Fasta(genome_file) 7761 7762 log.debug("Start annotation HGVS...") 7763 7764 # Create 7765 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7766 ddf = dd.from_pandas(df_variants, npartitions=threads) 7767 7768 # Use dask.dataframe.apply() to apply function on each partition 7769 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7770 7771 # Convert Dask DataFrame to Pandas Dataframe 7772 df = ddf.compute() 7773 7774 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7775 with tempfile.TemporaryDirectory() as tmpdir: 7776 df_parquet = os.path.join(tmpdir, "df.parquet") 7777 df.to_parquet(df_parquet) 7778 7779 # Update hgvs column 7780 update_variant_query = f""" 7781 UPDATE {table_variants} 7782 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7783 FROM read_parquet('{df_parquet}') as df 7784 WHERE variants."#CHROM" = df.CHROM 7785 AND variants.POS = df.POS 7786 AND variants.REF = df.REF 7787 AND variants.ALT = df.ALT 7788 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7789 """ 7790 self.execute_query(update_variant_query) 7791 7792 # Update INFO column 7793 sql_query_update = f""" 7794 UPDATE {table_variants} 7795 SET INFO = 7796 concat( 7797 CASE 7798 WHEN INFO NOT IN ('','.') 7799 THEN concat(INFO, ';') 7800 ELSE '' 7801 END, 7802 'hgvs=', 7803 {hgvs_column_name} 7804 ) 7805 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7806 """ 7807 self.execute_query(sql_query_update) 7808 7809 # Add header 7810 HGVS_INFOS = { 7811 "hgvs": { 7812 "ID": "hgvs", 7813 "Number": ".", 7814 "Type": "String", 7815 "Description": f"HGVS annotatation with HOWARD", 7816 } 7817 } 7818 7819 for field in HGVS_INFOS: 7820 field_ID = HGVS_INFOS[field]["ID"] 7821 field_description = HGVS_INFOS[field]["Description"] 7822 self.get_header().infos[field_ID] = vcf.parser._Info( 7823 field_ID, 7824 HGVS_INFOS[field]["Number"], 7825 HGVS_INFOS[field]["Type"], 7826 field_description, 7827 "unknown", 7828 "unknown", 7829 code_type_map[HGVS_INFOS[field]["Type"]], 7830 ) 7831 7832 # Remove added columns 7833 for added_column in added_columns: 7834 self.drop_column(column=added_column) 7835 7836 ### 7837 # Calculation 7838 ### 7839 7840 def get_operations_help( 7841 self, operations_config_dict: dict = {}, operations_config_file: str = None 7842 ) -> list: 7843 7844 # Init 7845 operations_help = [] 7846 7847 # operations 7848 operations = self.get_config_json( 7849 name="calculations", 7850 
config_dict=operations_config_dict, 7851 config_file=operations_config_file, 7852 ) 7853 for op in operations: 7854 op_name = operations[op].get("name", op).upper() 7855 op_description = operations[op].get("description", op_name) 7856 op_available = operations[op].get("available", False) 7857 if op_available: 7858 operations_help.append(f" {op_name}: {op_description}") 7859 7860 # Sort operations 7861 operations_help.sort() 7862 7863 # insert header 7864 operations_help.insert(0, "Available calculation operations:") 7865 7866 # Return 7867 return operations_help 7868 7869 def calculation( 7870 self, 7871 operations: dict = {}, 7872 operations_config_dict: dict = {}, 7873 operations_config_file: str = None, 7874 ) -> None: 7875 """ 7876 It takes a list of operations, and for each operation, it checks if it's a python or sql 7877 operation, and then calls the appropriate function 7878 7879 param json example: 7880 "calculation": { 7881 "NOMEN": { 7882 "options": { 7883 "hgvs_field": "hgvs" 7884 }, 7885 "middle" : null 7886 } 7887 """ 7888 7889 # Param 7890 param = self.get_param() 7891 7892 # operations config 7893 operations_config = self.get_config_json( 7894 name="calculations", 7895 config_dict=operations_config_dict, 7896 config_file=operations_config_file, 7897 ) 7898 7899 # Upper keys 7900 operations_config = {k.upper(): v for k, v in operations_config.items()} 7901 7902 # Calculations 7903 7904 # Operations from param 7905 operations = param.get("calculation", {}).get("calculations", operations) 7906 7907 # Quick calculation - add 7908 if param.get("calculations", None): 7909 calculations_list = [ 7910 value for value in param.get("calculations", "").split(",") 7911 ] 7912 log.info(f"Quick Calculations:") 7913 for calculation_key in calculations_list: 7914 log.info(f" {calculation_key}") 7915 for calculation_operation in calculations_list: 7916 if calculation_operation.upper() not in operations: 7917 operations[calculation_operation.upper()] = {} 7918 
add_value_into_dict( 7919 dict_tree=param, 7920 sections=[ 7921 "calculation", 7922 "calculations", 7923 calculation_operation.upper(), 7924 ], 7925 value={}, 7926 ) 7927 7928 # Operations for calculation 7929 if not operations: 7930 operations = param.get("calculation", {}).get("calculations", {}) 7931 7932 if operations: 7933 log.info(f"Calculations...") 7934 7935 # For each operations 7936 for operation_name in operations: 7937 operation_name = operation_name.upper() 7938 if operation_name not in [""]: 7939 if operation_name in operations_config: 7940 log.info(f"Calculation '{operation_name}'") 7941 operation = operations_config[operation_name] 7942 operation_type = operation.get("type", "sql") 7943 if operation_type == "python": 7944 self.calculation_process_function( 7945 operation=operation, operation_name=operation_name 7946 ) 7947 elif operation_type == "sql": 7948 self.calculation_process_sql( 7949 operation=operation, operation_name=operation_name 7950 ) 7951 else: 7952 log.error( 7953 f"Operations config: Type '{operation_type}' NOT available" 7954 ) 7955 raise ValueError( 7956 f"Operations config: Type '{operation_type}' NOT available" 7957 ) 7958 else: 7959 log.error( 7960 f"Operations config: Calculation '{operation_name}' NOT available" 7961 ) 7962 raise ValueError( 7963 f"Operations config: Calculation '{operation_name}' NOT available" 7964 ) 7965 7966 # Explode INFOS fields into table fields 7967 if self.get_explode_infos(): 7968 self.explode_infos( 7969 prefix=self.get_explode_infos_prefix(), 7970 fields=self.get_explode_infos_fields(), 7971 force=True, 7972 ) 7973 7974 def calculation_process_sql( 7975 self, operation: dict, operation_name: str = "unknown" 7976 ) -> None: 7977 """ 7978 The `calculation_process_sql` function takes in a mathematical operation as a string and 7979 performs the operation, updating the specified table with the result. 
7980 7981 :param operation: The `operation` parameter is a dictionary that contains information about the 7982 mathematical operation to be performed. It includes the following keys: 7983 :type operation: dict 7984 :param operation_name: The `operation_name` parameter is a string that represents the name of 7985 the mathematical operation being performed. It is used for logging and error handling purposes, 7986 defaults to unknown 7987 :type operation_name: str (optional) 7988 """ 7989 7990 # table variants 7991 table_variants = self.get_table_variants(clause="alter") 7992 7993 # Operation infos 7994 operation_name = operation.get("name", "unknown") 7995 log.debug(f"process sql {operation_name}") 7996 output_column_name = operation.get("output_column_name", operation_name) 7997 output_column_type = operation.get("output_column_type", "String") 7998 prefix = operation.get("explode_infos_prefix", "") 7999 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8000 output_column_description = operation.get( 8001 "output_column_description", f"{operation_name} operation" 8002 ) 8003 operation_query = operation.get("operation_query", None) 8004 if isinstance(operation_query, list): 8005 operation_query = " ".join(operation_query) 8006 operation_info_fields = operation.get("info_fields", []) 8007 operation_info_fields_check = operation.get("info_fields_check", False) 8008 operation_info = operation.get("operation_info", True) 8009 8010 if operation_query: 8011 8012 # Info fields check 8013 operation_info_fields_check_result = True 8014 if operation_info_fields_check: 8015 header_infos = self.get_header().infos 8016 for info_field in operation_info_fields: 8017 operation_info_fields_check_result = ( 8018 operation_info_fields_check_result 8019 and info_field in header_infos 8020 ) 8021 8022 # If info fields available 8023 if operation_info_fields_check_result: 8024 8025 # Added_columns 8026 added_columns = [] 8027 8028 # Create VCF header field 
8029 vcf_reader = self.get_header() 8030 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8031 output_column_name, 8032 ".", 8033 output_column_type, 8034 output_column_description, 8035 "howard calculation", 8036 "0", 8037 self.code_type_map.get(output_column_type), 8038 ) 8039 8040 # Explode infos if needed 8041 log.debug(f"calculation_process_sql prefix {prefix}") 8042 added_columns += self.explode_infos( 8043 prefix=prefix, 8044 fields=[output_column_name] + operation_info_fields, 8045 force=True, 8046 ) 8047 8048 # Create column 8049 added_column = self.add_column( 8050 table_name=table_variants, 8051 column_name=prefix + output_column_name, 8052 column_type=output_column_type_sql, 8053 default_value="null", 8054 ) 8055 added_columns.append(added_column) 8056 8057 # Operation calculation 8058 try: 8059 8060 # Query to update calculation column 8061 sql_update = f""" 8062 UPDATE {table_variants} 8063 SET "{prefix}{output_column_name}" = ({operation_query}) 8064 """ 8065 self.conn.execute(sql_update) 8066 8067 # Add to INFO 8068 if operation_info: 8069 sql_update_info = f""" 8070 UPDATE {table_variants} 8071 SET "INFO" = 8072 concat( 8073 CASE 8074 WHEN "INFO" IS NOT NULL 8075 THEN concat("INFO", ';') 8076 ELSE '' 8077 END, 8078 '{output_column_name}=', 8079 "{prefix}{output_column_name}" 8080 ) 8081 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8082 """ 8083 self.conn.execute(sql_update_info) 8084 8085 except: 8086 log.error( 8087 f"Operations config: Calculation '{operation_name}' query failed" 8088 ) 8089 raise ValueError( 8090 f"Operations config: Calculation '{operation_name}' query failed" 8091 ) 8092 8093 # Remove added columns 8094 for added_column in added_columns: 8095 log.debug(f"added_column: {added_column}") 8096 self.drop_column(column=added_column) 8097 8098 else: 8099 log.error( 8100 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8101 ) 8102 raise ValueError( 8103 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8104 ) 8105 8106 else: 8107 log.error( 8108 f"Operations config: Calculation '{operation_name}' query NOT defined" 8109 ) 8110 raise ValueError( 8111 f"Operations config: Calculation '{operation_name}' query NOT defined" 8112 ) 8113 8114 def calculation_process_function( 8115 self, operation: dict, operation_name: str = "unknown" 8116 ) -> None: 8117 """ 8118 The `calculation_process_function` takes in an operation dictionary and performs the specified 8119 function with the given parameters. 8120 8121 :param operation: The `operation` parameter is a dictionary that contains information about the 8122 operation to be performed. It has the following keys: 8123 :type operation: dict 8124 :param operation_name: The `operation_name` parameter is a string that represents the name of 8125 the operation being performed. It is used for logging purposes, defaults to unknown 8126 :type operation_name: str (optional) 8127 """ 8128 8129 operation_name = operation["name"] 8130 log.debug(f"process sql {operation_name}") 8131 function_name = operation["function_name"] 8132 function_params = operation["function_params"] 8133 getattr(self, function_name)(*function_params) 8134 8135 def calculation_variant_id(self) -> None: 8136 """ 8137 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8138 updates the INFO field of a variants table with the variant ID. 
8139 """ 8140 8141 # variant_id annotation field 8142 variant_id_tag = self.get_variant_id_column() 8143 added_columns = [variant_id_tag] 8144 8145 # variant_id hgvs tags" 8146 vcf_infos_tags = { 8147 variant_id_tag: "howard variant ID annotation", 8148 } 8149 8150 # Variants table 8151 table_variants = self.get_table_variants() 8152 8153 # Header 8154 vcf_reader = self.get_header() 8155 8156 # Add variant_id to header 8157 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8158 variant_id_tag, 8159 ".", 8160 "String", 8161 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8162 "howard calculation", 8163 "0", 8164 self.code_type_map.get("String"), 8165 ) 8166 8167 # Update 8168 sql_update = f""" 8169 UPDATE {table_variants} 8170 SET "INFO" = 8171 concat( 8172 CASE 8173 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8174 THEN '' 8175 ELSE concat("INFO", ';') 8176 END, 8177 '{variant_id_tag}=', 8178 "{variant_id_tag}" 8179 ) 8180 """ 8181 self.conn.execute(sql_update) 8182 8183 # Remove added columns 8184 for added_column in added_columns: 8185 self.drop_column(column=added_column) 8186 8187 def calculation_extract_snpeff_hgvs( 8188 self, 8189 snpeff_hgvs: str = "snpeff_hgvs", 8190 snpeff_field: str = "ANN", 8191 ) -> None: 8192 """ 8193 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8194 annotation field in a VCF file and adds them as a new column in the variants table. 8195 8196 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8197 function is used to specify the name of the column that will store the HGVS nomenclatures 8198 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8199 snpeff_hgvs 8200 :type snpeff_hgvs: str (optional) 8201 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8202 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8203 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8204 to ANN 8205 :type snpeff_field: str (optional) 8206 """ 8207 8208 # Snpeff hgvs tags 8209 vcf_infos_tags = { 8210 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8211 } 8212 8213 # Prefix 8214 prefix = self.get_explode_infos_prefix() 8215 if prefix: 8216 prefix = "INFO/" 8217 8218 # snpEff fields 8219 speff_ann_infos = prefix + snpeff_field 8220 speff_hgvs_infos = prefix + snpeff_hgvs 8221 8222 # Variants table 8223 table_variants = self.get_table_variants() 8224 8225 # Header 8226 vcf_reader = self.get_header() 8227 8228 # Add columns 8229 added_columns = [] 8230 8231 # Explode HGVS field in column 8232 added_columns += self.explode_infos(fields=[snpeff_field]) 8233 8234 if snpeff_field in vcf_reader.infos: 8235 8236 log.debug(vcf_reader.infos[snpeff_field]) 8237 8238 # Extract ANN header 8239 ann_description = vcf_reader.infos[snpeff_field].desc 8240 pattern = r"'(.+?)'" 8241 match = re.search(pattern, ann_description) 8242 if match: 8243 ann_header_match = match.group(1).split(" | ") 8244 ann_header_desc = {} 8245 for i in range(len(ann_header_match)): 8246 ann_header_info = "".join( 8247 char for char in ann_header_match[i] if char.isalnum() 8248 ) 8249 ann_header_desc[ann_header_info] = ann_header_match[i] 8250 if not ann_header_desc: 8251 raise ValueError("Invalid header description format") 8252 else: 8253 raise ValueError("Invalid header description format") 8254 8255 # Create variant id 8256 variant_id_column = self.get_variant_id_column() 8257 added_columns += [variant_id_column] 8258 8259 # Create dataframe 8260 dataframe_snpeff_hgvs = self.get_query_to_df( 8261 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8262 ) 8263 8264 # Create main NOMEN column 8265 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8266 speff_ann_infos 8267 ].apply( 8268 lambda x: extract_snpeff_hgvs( 
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: join the per-variant dataframe back into the table and
            # append '<tag>=<value>' to INFO for non-empty values.
            # NOTE(review): 'variants' is hard-coded in UPDATE while the WHERE
            # clause uses {table_variants} — confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # NOTE(review): runtime message contains typos ("Anotate", "before use") —
            # kept byte-identical here
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether the output
        should be uniquified or not. When set to `True`, duplicate entries are removed, defaults to
        True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format in which the output
        annotations will be generated. It has a default value of "fields". You can also set it to
        "JSON" to output the annotations in JSON format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is used to specify the prefix that will be
        added to the output annotations generated during the calculation process, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is used to specify the field in the VCF file
        that contains SnpEff annotations. This field will be processed to explode the annotations and
        update the variant information accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff sub-field names are quoted in the INFO
            # description, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of variant id + exploded ANN column
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one INFO field in JSON mode, or one INFO field per
            # snpEff sub-annotation in "fields" mode
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: append exploded annotations to INFO for non-empty values.
            # NOTE(review): 'variants' is hard-coded in UPDATE while the WHERE
            # clause uses {table_variants} — confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field (dataframe column holding the full NOMEN dict per variant)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: tag -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field from param (calculation.calculations.NOMEN.options.hgvs_field)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts preference file (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe keyed on (#CHROM, POS, REF, ALT)
            # NOTE(review): table name 'variants' hard-coded here — confirm it
            # always matches get_table_variants()
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN sub-fields)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update: each NOMEN fragment already carries its leading ';'
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable if the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over genotypes)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO for non-empty values.
            # NOTE(review): 'variants' is hard-coded here instead of {table_variants}
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # Only applicable if the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise over genotypes)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): fallback description "snpEff hgvs annotations" looks
            # like a copy-paste leftover — unused when vcf_infos_tags has the key
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO for non-empty values.
            # NOTE(review): 'variants' is hard-coded here instead of {table_variants}
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable if the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against explicit empty tag)
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): default `vcf_infos_tags.get(tag)` is redundant — it is
            # the same lookup as the primary one
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO for non-empty values
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable if the VCF has a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree may be a file path, a JSON string, a
            # comma-separated sample list, or a dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: every sample is its own family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over family genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields to header (tag and tag+'S')
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<samples>' to each sample genotype and
            # ':<tag>:<tag>S' to FORMAT; samples outside the family get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes './.' pad with '.' for every other
                # FORMAT sub-field before appending
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
9143 """ 9144 9145 # if FORMAT and samples 9146 if ( 9147 "FORMAT" in self.get_header_columns_as_list() 9148 and self.get_header_sample_list() 9149 ): 9150 9151 # trio annotation field 9152 trio_tag = "trio" 9153 9154 # VCF infos tags 9155 vcf_infos_tags = { 9156 "trio": "trio calculation", 9157 } 9158 9159 # Param 9160 param = self.get_param() 9161 9162 # Prefix 9163 prefix = self.get_explode_infos_prefix() 9164 9165 # Trio param 9166 trio_ped = ( 9167 param.get("calculation", {}) 9168 .get("calculations", {}) 9169 .get("TRIO", {}) 9170 .get("trio_pedigree", None) 9171 ) 9172 9173 # Load trio 9174 if trio_ped: 9175 9176 # Trio pedigree is a file 9177 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9178 log.debug("TRIO pedigree is file") 9179 with open(full_path(trio_ped)) as trio_ped: 9180 trio_ped = json.load(trio_ped) 9181 9182 # Trio pedigree is a string 9183 elif isinstance(trio_ped, str): 9184 log.debug("TRIO pedigree is str") 9185 try: 9186 trio_ped = json.loads(trio_ped) 9187 log.debug("TRIO pedigree is json str") 9188 except ValueError as e: 9189 trio_samples = trio_ped.split(",") 9190 if len(trio_samples) == 3: 9191 trio_ped = { 9192 "father": trio_samples[0], 9193 "mother": trio_samples[1], 9194 "child": trio_samples[2], 9195 } 9196 log.debug("TRIO pedigree is list str") 9197 else: 9198 msg_error = "TRIO pedigree not well formatted" 9199 log.error(msg_error) 9200 raise ValueError(msg_error) 9201 9202 # Trio pedigree is a dict 9203 elif isinstance(trio_ped, dict): 9204 log.debug("TRIO pedigree is dict") 9205 9206 # Trio pedigree is not well formatted 9207 else: 9208 msg_error = "TRIO pedigree not well formatted" 9209 log.error(msg_error) 9210 raise ValueError(msg_error) 9211 9212 # Construct trio list 9213 trio_samples = [ 9214 trio_ped.get("father", ""), 9215 trio_ped.get("mother", ""), 9216 trio_ped.get("child", ""), 9217 ] 9218 9219 else: 9220 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9221 samples_list = self.get_header_sample_list() 9222 if len(samples_list) >= 3: 9223 trio_samples = self.get_header_sample_list()[0:3] 9224 trio_ped = { 9225 "father": trio_samples[0], 9226 "mother": trio_samples[1], 9227 "child": trio_samples[2], 9228 } 9229 else: 9230 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9231 log.error(msg_error) 9232 raise ValueError(msg_error) 9233 9234 # Check trio pedigree 9235 if not trio_ped or len(trio_ped) != 3: 9236 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9237 log.error(msg_error) 9238 raise ValueError(msg_error) 9239 9240 # Log 9241 log.info( 9242 f"Calculation 'TRIO' - Samples: " 9243 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9244 ) 9245 9246 # Field 9247 trio_infos = prefix + trio_tag 9248 9249 # Variants table 9250 table_variants = self.get_table_variants() 9251 9252 # Header 9253 vcf_reader = self.get_header() 9254 9255 # Create variant id 9256 variant_id_column = self.get_variant_id_column() 9257 added_columns = [variant_id_column] 9258 9259 # variant_id, FORMAT and samples 9260 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9261 self.get_header_sample_list() 9262 ) 9263 9264 # Create dataframe 9265 dataframe_trio = self.get_query_to_df( 9266 f""" SELECT {samples_fields} FROM {table_variants} """ 9267 ) 9268 9269 # Create trio column 9270 dataframe_trio[trio_infos] = dataframe_trio.apply( 9271 lambda row: trio(row, samples=trio_samples), axis=1 9272 ) 9273 9274 # Add trio to header 9275 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9276 trio_tag, 9277 ".", 9278 "String", 9279 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9280 "howard calculation", 9281 "0", 9282 self.code_type_map.get("String"), 9283 ) 9284 9285 # Update 9286 sql_update = f""" 9287 UPDATE {table_variants} 9288 SET "INFO" = 9289 concat( 9290 CASE 9291 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9292 THEN '' 9293 ELSE 
concat("INFO", ';') 9294 END, 9295 CASE 9296 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9297 AND dataframe_trio."{trio_infos}" NOT NULL 9298 THEN concat( 9299 '{trio_tag}=', 9300 dataframe_trio."{trio_infos}" 9301 ) 9302 ELSE '' 9303 END 9304 ) 9305 FROM dataframe_trio 9306 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9307 """ 9308 self.conn.execute(sql_update) 9309 9310 # Remove added columns 9311 for added_column in added_columns: 9312 self.drop_column(column=added_column) 9313 9314 # Delete dataframe 9315 del dataframe_trio 9316 gc.collect() 9317 9318 def calculation_vaf_normalization(self) -> None: 9319 """ 9320 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9321 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9322 :return: The function does not return anything. 9323 """ 9324 9325 # if FORMAT and samples 9326 if ( 9327 "FORMAT" in self.get_header_columns_as_list() 9328 and self.get_header_sample_list() 9329 ): 9330 9331 # vaf_normalization annotation field 9332 vaf_normalization_tag = "VAF" 9333 9334 # VCF infos tags 9335 vcf_infos_tags = { 9336 "VAF": "VAF Variant Frequency", 9337 } 9338 9339 # Prefix 9340 prefix = self.get_explode_infos_prefix() 9341 9342 # Variants table 9343 table_variants = self.get_table_variants() 9344 9345 # Header 9346 vcf_reader = self.get_header() 9347 9348 # Do not calculate if VAF already exists 9349 if "VAF" in vcf_reader.formats: 9350 log.debug("VAF already on genotypes") 9351 return 9352 9353 # Create variant id 9354 variant_id_column = self.get_variant_id_column() 9355 added_columns = [variant_id_column] 9356 9357 # variant_id, FORMAT and samples 9358 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9359 f""" "{sample}" """ for sample in self.get_header_sample_list() 9360 ) 9361 9362 # Create dataframe 9363 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9364 log.debug(f"query={query}") 9365 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9366 9367 vaf_normalization_set = [] 9368 9369 # for each sample vaf_normalization 9370 for sample in self.get_header_sample_list(): 9371 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9372 lambda row: vaf_normalization(row, sample=sample), axis=1 9373 ) 9374 vaf_normalization_set.append( 9375 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9376 ) 9377 9378 # Add VAF to FORMAT 9379 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9380 "FORMAT" 9381 ].apply(lambda x: str(x) + ":VAF") 9382 vaf_normalization_set.append( 9383 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9384 ) 9385 9386 # Add vaf_normalization to header 9387 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9388 id=vaf_normalization_tag, 9389 num="1", 9390 type="Float", 9391 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9392 type_code=self.code_type_map.get("Float"), 9393 ) 9394 9395 # Create fields to add in INFO 9396 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9397 9398 # Update 9399 sql_update = f""" 9400 UPDATE {table_variants} 9401 SET {sql_vaf_normalization_set} 9402 FROM dataframe_vaf_normalization 9403 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9404 9405 """ 9406 self.conn.execute(sql_update) 9407 9408 # Remove added columns 9409 for added_column in added_columns: 9410 self.drop_column(column=added_column) 9411 9412 # Delete dataframe 9413 del dataframe_vaf_normalization 9414 gc.collect() 9415 9416 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9417 """ 9418 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9419 field in a VCF file and updates the INFO column of the variants table with the calculated 9420 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only compute when the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO field per statistic derived from `info`
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: a dict of statistics per variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract the stat value from the per-row stats dict ('' when absent)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat field in the VCF header
                # (comment previously said "snpeff_hgvs" — copy/paste leftover)
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every field except the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append the computed stats to the INFO column of each variant
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.
9560 9561 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9562 is a string parameter that represents the information field to be used in the transcripts JSON. 9563 It is used to specify the JSON format for the transcripts information. If no value is provided 9564 when calling the method, it defaults to " 9565 :type info_json: str 9566 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9567 method is a string parameter that specifies the format of the information field to be used in 9568 the transcripts JSON. It is used to define the format of the information field 9569 :type info_format: str 9570 """ 9571 9572 # Create transcripts table 9573 transcripts_table = self.create_transcript_view() 9574 9575 # Add info field 9576 if transcripts_table: 9577 self.transcript_view_to_variants( 9578 transcripts_table=transcripts_table, 9579 transcripts_info_field_json=info_json, 9580 transcripts_info_field_format=info_format, 9581 ) 9582 else: 9583 log.info("No Transcripts to process. Check param.json file configuration") 9584 9585 def calculation_transcripts_prioritization(self) -> None: 9586 """ 9587 The function `calculation_transcripts_prioritization` creates a transcripts table and 9588 prioritizes transcripts based on certain criteria. 9589 """ 9590 9591 # Create transcripts table 9592 transcripts_table = self.create_transcript_view() 9593 9594 # Add info field 9595 if transcripts_table: 9596 self.transcripts_prioritization(transcripts_table=transcripts_table) 9597 else: 9598 log.info("No Transcripts to process. 
Check param.json file configuration") 9599 9600 ############### 9601 # Transcripts # 9602 ############### 9603 9604 def transcripts_prioritization( 9605 self, transcripts_table: str = None, param: dict = {} 9606 ) -> bool: 9607 """ 9608 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9609 and updates the variants table with the prioritized information. 9610 9611 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9612 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9613 This parameter is used to identify the table where the transcripts data is stored for the 9614 prioritization process 9615 :type transcripts_table: str 9616 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9617 that contains various configuration settings for the prioritization process of transcripts. It 9618 is used to customize the behavior of the prioritization algorithm and includes settings such as 9619 the prefix for prioritization fields, default profiles, and other 9620 :type param: dict 9621 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9622 transcripts prioritization process is successfully completed, and `False` if there are any 9623 issues or if no profile is defined for transcripts prioritization. 
9624 """ 9625 9626 log.debug("Start transcripts prioritization...") 9627 9628 # Param 9629 if not param: 9630 param = self.get_param() 9631 9632 # Variants table 9633 table_variants = self.get_table_variants() 9634 log.debug(f"transcripts_table={transcripts_table}") 9635 # Transcripts table 9636 if transcripts_table is None: 9637 log.debug(f"transcripts_table={transcripts_table}") 9638 transcripts_table = self.create_transcript_view( 9639 transcripts_table="transcripts", param=param 9640 ) 9641 log.debug(f"transcripts_table={transcripts_table}") 9642 if transcripts_table is None: 9643 msg_err = "No Transcripts table availalble" 9644 log.error(msg_err) 9645 raise ValueError(msg_err) 9646 9647 # Get transcripts columns 9648 columns_as_list_query = f""" 9649 DESCRIBE {transcripts_table} 9650 """ 9651 columns_as_list = list( 9652 self.get_query_to_df(columns_as_list_query)["column_name"] 9653 ) 9654 9655 # Create INFO if not exists 9656 if "INFO" not in columns_as_list: 9657 query_add_info = f""" 9658 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9659 """ 9660 self.execute_query(query_add_info) 9661 9662 # Prioritization param and Force only PZ Score and Flag 9663 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9664 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9665 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9666 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9667 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9668 pz_profile_default = ( 9669 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9670 ) 9671 9672 # Exit if no profile 9673 if pz_profile_default is None: 9674 log.warning("No profile defined for transcripts prioritization") 9675 return False 9676 9677 # Prioritization 9678 prioritization_result = self.prioritization( 9679 table=transcripts_table, 9680 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9681 ) 9682 if not 
prioritization_result: 9683 log.warning("Transcripts prioritization not processed") 9684 return False 9685 9686 # Explode PZ fields 9687 self.explode_infos( 9688 table=transcripts_table, 9689 fields=param.get("transcripts", {}) 9690 .get("prioritization", {}) 9691 .get("pzfields", []), 9692 ) 9693 9694 # Export Transcripts prioritization infos to variants table 9695 query_update = f""" 9696 WITH RankedTranscripts AS ( 9697 SELECT 9698 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9699 ROW_NUMBER() OVER ( 9700 PARTITION BY "#CHROM", POS, REF, ALT 9701 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9702 ) AS rn 9703 FROM 9704 {transcripts_table} 9705 ) 9706 UPDATE {table_variants} 9707 SET 9708 INFO = CONCAT(CASE 9709 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9710 THEN '' 9711 ELSE concat("INFO", ';') 9712 END, 9713 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9714 ) 9715 FROM 9716 RankedTranscripts 9717 WHERE 9718 rn = 1 9719 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9720 AND variants."POS" = RankedTranscripts."POS" 9721 AND variants."REF" = RankedTranscripts."REF" 9722 AND variants."ALT" = RankedTranscripts."ALT" 9723 9724 """ 9725 self.execute_query(query=query_update) 9726 9727 # Add PZ Transcript in header 9728 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9729 pz_fields_transcripts, 9730 ".", 9731 "String", 9732 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9733 "unknown", 9734 "unknown", 9735 code_type_map["String"], 9736 ) 9737 9738 # Return 9739 return True 9740 9741 def create_transcript_view_from_columns_map( 9742 self, 9743 transcripts_table: str = "transcripts", 9744 columns_maps: dict = {}, 9745 added_columns: list = [], 9746 temporary_tables: list = None, 9747 annotation_fields: list = None, 9748 ) -> tuple[list, list, list]: 9749 """ 9750 The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on 9751 specified columns mapping for transcripts data. 9752 9753 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9754 the table where the transcripts data is stored or will be stored in the database. This table 9755 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9756 predictions, etc. It defaults to "transcripts, defaults to transcripts 9757 :type transcripts_table: str (optional) 9758 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9759 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9760 represents a mapping configuration for a specific set of columns. It typically includes details such 9761 as the main transcript column and additional information columns 9762 :type columns_maps: dict 9763 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9764 function is a list that stores the additional columns that will be added to the view being created 9765 based on the columns map provided. These columns are generated by exploding the transcript 9766 information columns along with the main transcript column 9767 :type added_columns: list 9768 :param temporary_tables: The `temporary_tables` parameter in the 9769 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9770 tables created during the process of creating a transcript view from a columns map. 
These temporary 9771 tables are used to store intermediate results or transformations before the final view is generated 9772 :type temporary_tables: list 9773 :param annotation_fields: The `annotation_fields` parameter in the 9774 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9775 for annotation in the query view creation process. These fields are extracted from the 9776 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9777 :type annotation_fields: list 9778 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9779 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9780 """ 9781 9782 log.debug("Start transcrpts view creation from columns map...") 9783 9784 # "from_columns_map": [ 9785 # { 9786 # "transcripts_column": "Ensembl_transcriptid", 9787 # "transcripts_infos_columns": [ 9788 # "genename", 9789 # "Ensembl_geneid", 9790 # "LIST_S2_score", 9791 # "LIST_S2_pred", 9792 # ], 9793 # }, 9794 # { 9795 # "transcripts_column": "Ensembl_transcriptid", 9796 # "transcripts_infos_columns": [ 9797 # "genename", 9798 # "VARITY_R_score", 9799 # "Aloft_pred", 9800 # ], 9801 # }, 9802 # ], 9803 9804 # Init 9805 if temporary_tables is None: 9806 temporary_tables = [] 9807 if annotation_fields is None: 9808 annotation_fields = [] 9809 9810 # Variants table 9811 table_variants = self.get_table_variants() 9812 9813 for columns_map in columns_maps: 9814 9815 # Transcript column 9816 transcripts_column = columns_map.get("transcripts_column", None) 9817 9818 # Transcripts infos columns 9819 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9820 9821 if transcripts_column is not None: 9822 9823 # Explode 9824 added_columns += self.explode_infos( 9825 fields=[transcripts_column] + transcripts_infos_columns 9826 ) 9827 9828 # View clauses 9829 clause_select = [] 9830 for field in [transcripts_column] + 
transcripts_infos_columns: 9831 clause_select.append( 9832 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9833 ) 9834 if field not in [transcripts_column]: 9835 annotation_fields.append(field) 9836 9837 # Querey View 9838 query = f""" 9839 SELECT 9840 "#CHROM", POS, REF, ALT, INFO, 9841 "{transcripts_column}" AS 'transcript', 9842 {", ".join(clause_select)} 9843 FROM ( 9844 SELECT 9845 "#CHROM", POS, REF, ALT, INFO, 9846 {", ".join(clause_select)} 9847 FROM {table_variants} 9848 ) 9849 WHERE "{transcripts_column}" IS NOT NULL 9850 """ 9851 9852 # Create temporary table 9853 temporary_table = transcripts_table + "".join( 9854 random.choices(string.ascii_uppercase + string.digits, k=10) 9855 ) 9856 9857 # Temporary_tables 9858 temporary_tables.append(temporary_table) 9859 query_view = f""" 9860 CREATE TEMPORARY TABLE {temporary_table} 9861 AS ({query}) 9862 """ 9863 self.execute_query(query=query_view) 9864 9865 return added_columns, temporary_tables, annotation_fields 9866 9867 def create_transcript_view_from_column_format( 9868 self, 9869 transcripts_table: str = "transcripts", 9870 column_formats: dict = {}, 9871 temporary_tables: list = None, 9872 annotation_fields: list = None, 9873 ) -> tuple[list, list, list]: 9874 """ 9875 The `create_transcript_view_from_column_format` function generates a transcript view based on 9876 specified column formats, adds additional columns and annotation fields, and returns the list of 9877 temporary tables and annotation fields. 9878 9879 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9880 the table containing the transcripts data. This table will be used as the base table for creating 9881 the transcript view. 
The default value for this parameter is "transcripts", but you can provide a 9882 different table name if needed, defaults to transcripts 9883 :type transcripts_table: str (optional) 9884 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9885 about the columns to be used for creating the transcript view. Each entry in the dictionary 9886 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9887 the provided code snippet: 9888 :type column_formats: dict 9889 :param temporary_tables: The `temporary_tables` parameter in the 9890 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9891 views created during the process of creating a transcript view from a column format. These temporary 9892 views are used to manipulate and extract data before generating the final transcript view. It 9893 :type temporary_tables: list 9894 :param annotation_fields: The `annotation_fields` parameter in the 9895 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9896 that are extracted from the temporary views created during the process. These annotation fields are 9897 obtained by querying the temporary views and extracting the column names excluding specific columns 9898 like `#CH 9899 :type annotation_fields: list 9900 :return: The `create_transcript_view_from_column_format` function returns two lists: 9901 `temporary_tables` and `annotation_fields`. 
9902 """ 9903 9904 log.debug("Start transcrpts view creation from column format...") 9905 9906 # "from_column_format": [ 9907 # { 9908 # "transcripts_column": "ANN", 9909 # "transcripts_infos_column": "Feature_ID", 9910 # } 9911 # ], 9912 9913 # Init 9914 if temporary_tables is None: 9915 temporary_tables = [] 9916 if annotation_fields is None: 9917 annotation_fields = [] 9918 9919 for column_format in column_formats: 9920 9921 # annotation field and transcript annotation field 9922 annotation_field = column_format.get("transcripts_column", "ANN") 9923 transcript_annotation = column_format.get( 9924 "transcripts_infos_column", "Feature_ID" 9925 ) 9926 9927 # Temporary View name 9928 temporary_view_name = transcripts_table + "".join( 9929 random.choices(string.ascii_uppercase + string.digits, k=10) 9930 ) 9931 9932 # Create temporary view name 9933 temporary_view_name = self.annotation_format_to_table( 9934 uniquify=True, 9935 annotation_field=annotation_field, 9936 view_name=temporary_view_name, 9937 annotation_id=transcript_annotation, 9938 ) 9939 9940 # Annotation fields 9941 if temporary_view_name: 9942 query_annotation_fields = f""" 9943 SELECT * 9944 FROM ( 9945 DESCRIBE SELECT * 9946 FROM {temporary_view_name} 9947 ) 9948 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9949 """ 9950 df_annotation_fields = self.get_query_to_df( 9951 query=query_annotation_fields 9952 ) 9953 9954 # Add temporary view and annotation fields 9955 temporary_tables.append(temporary_view_name) 9956 annotation_fields += list(set(df_annotation_fields["column_name"])) 9957 9958 return temporary_tables, annotation_fields 9959 9960 def create_transcript_view( 9961 self, 9962 transcripts_table: str = None, 9963 transcripts_table_drop: bool = True, 9964 param: dict = {}, 9965 ) -> str: 9966 """ 9967 The `create_transcript_view` function generates a transcript view by processing data from a 9968 specified table based on provided parameters and structural information. 
9969 9970 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9971 is used to specify the name of the table that will store the final transcript view data. If a table 9972 name is not provided, the function will create a new table to store the transcript view data, and by 9973 default,, defaults to transcripts 9974 :type transcripts_table: str (optional) 9975 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9976 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9977 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9978 the function will drop the existing transcripts table if it exists, defaults to True 9979 :type transcripts_table_drop: bool (optional) 9980 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9981 contains information needed to create a transcript view. It includes details such as the structure 9982 of the transcripts, columns mapping, column formats, and other necessary information for generating 9983 the view. This parameter allows for flexibility and customization 9984 :type param: dict 9985 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9986 created or modified during the execution of the function. 
9987 """ 9988 9989 log.debug("Start transcripts view creation...") 9990 9991 # Default 9992 transcripts_table_default = "transcripts" 9993 9994 # Param 9995 if not param: 9996 param = self.get_param() 9997 9998 # Struct 9999 struct = param.get("transcripts", {}).get("struct", None) 10000 10001 if struct: 10002 10003 # Transcripts table 10004 if transcripts_table is None: 10005 transcripts_table = param.get("transcripts", {}).get( 10006 "table", transcripts_table_default 10007 ) 10008 10009 # added_columns 10010 added_columns = [] 10011 10012 # Temporary tables 10013 temporary_tables = [] 10014 10015 # Annotation fields 10016 annotation_fields = [] 10017 10018 # from columns map 10019 columns_maps = struct.get("from_columns_map", []) 10020 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10021 self.create_transcript_view_from_columns_map( 10022 transcripts_table=transcripts_table, 10023 columns_maps=columns_maps, 10024 added_columns=added_columns, 10025 temporary_tables=temporary_tables, 10026 annotation_fields=annotation_fields, 10027 ) 10028 ) 10029 added_columns += added_columns_tmp 10030 temporary_tables += temporary_tables_tmp 10031 annotation_fields += annotation_fields_tmp 10032 10033 # from column format 10034 column_formats = struct.get("from_column_format", []) 10035 temporary_tables_tmp, annotation_fields_tmp = ( 10036 self.create_transcript_view_from_column_format( 10037 transcripts_table=transcripts_table, 10038 column_formats=column_formats, 10039 temporary_tables=temporary_tables, 10040 annotation_fields=annotation_fields, 10041 ) 10042 ) 10043 temporary_tables += temporary_tables_tmp 10044 annotation_fields += annotation_fields_tmp 10045 10046 # Merge temporary tables query 10047 query_merge = "" 10048 for temporary_table in temporary_tables: 10049 10050 # First temporary table 10051 if not query_merge: 10052 query_merge = f""" 10053 SELECT * FROM {temporary_table} 10054 """ 10055 # other temporary table (using UNION) 10056 else: 
10057 query_merge += f""" 10058 UNION BY NAME SELECT * FROM {temporary_table} 10059 """ 10060 10061 # Merge on transcript 10062 query_merge_on_transcripts_annotation_fields = [] 10063 # Aggregate all annotations fields 10064 for annotation_field in set(annotation_fields): 10065 query_merge_on_transcripts_annotation_fields.append( 10066 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10067 ) 10068 # Query for transcripts view 10069 query_merge_on_transcripts = f""" 10070 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 10071 FROM ({query_merge}) 10072 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 10073 """ 10074 10075 # Drop transcript view is necessary 10076 if transcripts_table_drop: 10077 query_drop = f""" 10078 DROP TABLE IF EXISTS {transcripts_table}; 10079 """ 10080 self.execute_query(query=query_drop) 10081 10082 # Merge and create transcript view 10083 query_create_view = f""" 10084 CREATE TABLE IF NOT EXISTS {transcripts_table} 10085 AS {query_merge_on_transcripts} 10086 """ 10087 self.execute_query(query=query_create_view) 10088 10089 # Remove added columns 10090 for added_column in added_columns: 10091 self.drop_column(column=added_column) 10092 10093 else: 10094 10095 transcripts_table = None 10096 10097 return transcripts_table 10098 10099 def annotation_format_to_table( 10100 self, 10101 uniquify: bool = True, 10102 annotation_field: str = "ANN", 10103 annotation_id: str = "Feature_ID", 10104 view_name: str = "transcripts", 10105 ) -> str: 10106 """ 10107 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 10108 table format. 10109 10110 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 10111 values in the output or not. 
If set to `True`, the function will make sure that the output values 10112 are unique, defaults to True 10113 :type uniquify: bool (optional) 10114 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that 10115 contains the annotation information for each variant. This field is used to extract the annotation 10116 details for further processing in the function, defaults to ANN 10117 :type annotation_field: str (optional) 10118 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is 10119 used to specify the identifier for the annotation feature. This identifier will be used as a column 10120 name in the resulting table or view that is created based on the annotation data. It helps in 10121 uniquely identifying each annotation entry in the, defaults to Feature_ID 10122 :type annotation_id: str (optional) 10123 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to 10124 specify the name of the temporary table that will be created to store the transformed annotation 10125 data. This table will hold the extracted information from the annotation field in a structured 10126 format for further processing or analysis, defaults to transcripts 10127 :type view_name: str (optional) 10128 :return: The function `annotation_format_to_table` is returning the name of the view created, which 10129 is stored in the variable `view_name`. 
        """

        # Annotation field: name of the working column holding the exploded JSON
        annotation_format = "annotation_explode"

        # Transcript annotation: keep alphanumeric characters only (safe SQL identifier)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): this overrides any non-empty prefix with "INFO/" and keeps
        # an empty prefix empty; possibly `if not prefix:` was intended — confirm
        # against get_explode_infos_prefix() semantics and the column names queried below
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields (prefixed column names)
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: field names are listed in the INFO description
            # between single quotes, separated by " | " (snpEff convention)
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized (alphanumeric-only) field name mapped to its original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (temporary column)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns: explode the raw annotation string into JSON
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys present in the exploded JSON (first annotation entry)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Check keys
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only version used as SQL column alias
                key_clean = "".join(char for char in key if char.isalnum())

                # Type: extract all values of this key to infer the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed extraction clause for this key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view: one row per annotation entry, typed columns per key
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
                """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the VCF header: no view can be built
            view_name = None

        # Remove columns that were temporarily added by explode_infos()
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Inject per-transcript annotations from a transcripts view back into the
        variants table, as a dedicated column and/or an INFO field, in JSON
        and/or in a pipe-separated structured format.

        Every parameter left as None is resolved from `param` (or, when `param`
        is empty, from `self.get_param()`) under the "transcripts" section.

        :param transcripts_table: name of the table holding the transcripts view
            (default "transcripts")
        :param transcripts_column_id: column of `transcripts_table` holding the
            transcript identifier (default "transcript")
        :param transcripts_info_json: variants-table column to create and fill
            with the transcripts as JSON (skipped when None)
        :param transcripts_info_field_json: INFO field name to append with the
            transcripts as JSON (skipped when None)
        :param transcripts_info_format: variants-table column to create and fill
            with the transcripts in pipe-separated format (skipped when None)
        :param transcripts_info_field_format: INFO field name to append with the
            transcripts in pipe-separated format (skipped when None)
        :param param: parameters dict overriding `self.get_param()`
            (NOTE(review): mutable default `{}` — only read here, never mutated)
        :return: False when none of the four outputs is requested, True otherwise
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing requested at all: bail out early
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns (everything except the coordinate columns
        # and the transcript identifier itself)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SQL fragments: explode comma-separated values, JSON struct entries,
        # and plain column references for the pipe-separated format
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # Update
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" looks like a typo for "unknown" (source
            # and version fields of _Info); left as-is — confirm nothing
            # matches on this exact string before fixing.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, skipping empty/'.' values
            # NOTE(review): when only the INFO-field output is requested,
            # transcripts_info_json is None and the subquery alias below
            # becomes the literal identifier `None` — works, but fragile.
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate one JSON object per variant, keyed by
            # transcript id, then join back on the variant coordinates
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                        )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header (see NOTE(review) above about "unknwon")
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT (structured, pipe-separated)
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<value>' to INFO, skipping empty/'.' values
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one 'transcript|field1|field2|...' string per
            # transcript, aggregated per variant and joined on coordinates
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input/output files, config and param dicts,
        database connexion, VCF header and sample list; optionally load data.

        :param conn: existing database connection (duckdb or sqlite); a new
            one is created when None
        :param input: input file path (or a file-like object exposing `.name`)
        :param output: output file path (or a file-like object exposing `.name`)
        :param config: configuration dictionary
        :param param: parameters dictionary
        :param load: when True, load the input data immediately
        """
        # NOTE(review): config={} and param={} are mutable default arguments;
        # they are only stored here, but sharing one dict across instances is
        # a latent risk — confirm before refactoring.

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the `samples` attribute.

        :param samples: list of sample names; when falsy, the list is read
            from param["samples"]["list"] (None when absent)
        :type samples: list
        :return: the samples list that was set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the `samples` attribute set by `set_samples`.

        :return: the list of samples (possibly None)
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return param["samples"]["check"], i.e. whether samples must be
        checked. Defaults to True when the key is absent.

        :return: the boolean value of the "check" key of the "samples"
            section of the parameters (True when not set)
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set input file attributes: `input` (path), and — when an input is
        given — `input_name`, `input_extension` and `input_format` (the
        extension without the dot).

        :param input: input file path, or an object exposing `.name`
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            # NOTE(review): bare except hides the real error; the log/error
            # messages are also missing the closing quote after {input}.
            except:
                log.error(f"Input file '{input} in bad format")
                raise ValueError(f"Input file '{input} in bad format")
        else:
            self.input = input

        # Input format derived from the file extension
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Store the configuration dictionary on the instance.

        :param config: configuration dictionary for the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Store the parameters dictionary on the instance.

        :param param: parameters dictionary for the class
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the instance attributes used throughout the class.
        """

        # Default prefix and main variants table name
        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Comparison keywords mapped to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type codes
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header types mapped to SQL column types
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the "indexing" parameter (False when absent).

        :return: the value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for a
        connection, including the number of threads and memory limit.
        :return: a dictionary containing the configuration for the Connexion library.
216 """ 217 218 # config 219 config = self.get_config() 220 221 # Connexion config 222 connexion_config = {} 223 threads = self.get_threads() 224 225 # Threads 226 if threads: 227 connexion_config["threads"] = threads 228 229 # Memory 230 # if config.get("memory", None): 231 # connexion_config["memory_limit"] = config.get("memory") 232 if self.get_memory(): 233 connexion_config["memory_limit"] = self.get_memory() 234 235 # Temporary directory 236 if config.get("tmp", None): 237 connexion_config["temp_directory"] = config.get("tmp") 238 239 # Access 240 if config.get("access", None): 241 access = config.get("access") 242 if access in ["RO"]: 243 access = "READ_ONLY" 244 elif access in ["RW"]: 245 access = "READ_WRITE" 246 connexion_db = self.get_connexion_db() 247 if connexion_db in ":memory:": 248 access = "READ_WRITE" 249 connexion_config["access_mode"] = access 250 251 return connexion_config 252 253 def get_duckdb_settings(self) -> dict: 254 """ 255 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 256 string. 257 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 258 """ 259 260 # config 261 config = self.get_config() 262 263 # duckdb settings 264 duckdb_settings_dict = {} 265 if config.get("duckdb_settings", None): 266 duckdb_settings = config.get("duckdb_settings") 267 duckdb_settings = full_path(duckdb_settings) 268 # duckdb setting is a file 269 if os.path.exists(duckdb_settings): 270 with open(duckdb_settings) as json_file: 271 duckdb_settings_dict = yaml.safe_load(json_file) 272 # duckdb settings is a string 273 else: 274 duckdb_settings_dict = json.loads(duckdb_settings) 275 276 return duckdb_settings_dict 277 278 def set_connexion_db(self) -> str: 279 """ 280 The function `set_connexion_db` returns the appropriate database connection string based on the 281 input format and connection type. 282 :return: the value of the variable `connexion_db`. 
283 """ 284 285 # Default connexion db 286 default_connexion_db = ":memory:" 287 288 # Find connexion db 289 if self.get_input_format() in ["db", "duckdb"]: 290 connexion_db = self.get_input() 291 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 292 connexion_db = default_connexion_db 293 elif self.get_connexion_type() in ["tmpfile"]: 294 tmp_name = tempfile.mkdtemp( 295 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 296 ) 297 connexion_db = f"{tmp_name}/tmp.db" 298 elif self.get_connexion_type() != "": 299 connexion_db = self.get_connexion_type() 300 else: 301 connexion_db = default_connexion_db 302 303 # Set connexion db 304 self.connexion_db = connexion_db 305 306 return connexion_db 307 308 def set_connexion(self, conn) -> None: 309 """ 310 The function `set_connexion` creates a connection to a database, with options for different 311 database formats and settings. 312 313 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 314 database. If a connection is not provided, a new connection to an in-memory database is created. 
        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
        sqlite
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format (duckdb by default)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion: create one only when none was provided
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply duckDB settings through PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        Set output file attributes: `output` (path), `output_name`,
        `output_extension` and `output_format` (the extension without the
        dot); all are None when no output is provided.

        :param output: output file path, or an object exposing `.name`
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format derived from the file extension
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the VCF header of the input file and store it both as a list of
        strings (`header_list`) and as a `vcf.Reader` object (`header_vcf`);
        both are set to None when there is no input file.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): plain open/close without a context
                            # manager; leaks the handle if vcf.Writer raises.
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except silently falls back to the
                    # default header for ANY failure, not just parse errors.
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
        DataFrame based on the connection format.

        :param query: The `query` parameter in the `get_query_to_df` function is a string that
        represents the SQL query you want to execute.
        This query will be used to fetch data from a
        database and convert it into a pandas DataFrame
        :type query: str
        :param limit: maximum number of rows to fetch from the query result;
        when None, the full result is returned
        :type limit: int
        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch only the first record batch of `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of `limit` rows
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input/output files and their
        formats, config and param dictionaries, the sample list and the
        variants dataframe.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, variant counts
        (total, by chromosome, by type), samples and genotypes, header INFO
        and FORMAT fields, and quality metrics.

        :return: a dictionary of statistics grouped by section.
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

stats["Infos"]["Number of variants"] = int(nb_of_variants) 620 621 ### Samples 622 623 # Init 624 samples = {} 625 nb_of_samples = 0 626 627 # Check Samples 628 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 629 log.debug(f"Check samples...") 630 for sample in self.get_header_sample_list(): 631 sql_query_samples = f""" 632 SELECT '{sample}' as sample, 633 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 634 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 635 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 636 FROM {table_variants_from} 637 WHERE ( 638 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 639 AND 640 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 641 ) 642 GROUP BY genotype 643 """ 644 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 645 sample_genotype_count = sql_query_genotype_df["count"].sum() 646 if len(sql_query_genotype_df): 647 nb_of_samples += 1 648 samples[f"{sample} - {sample_genotype_count} variants"] = ( 649 sql_query_genotype_df.to_dict(orient="index") 650 ) 651 652 stats["Samples"] = samples 653 stats["Infos"]["Number of samples"] = nb_of_samples 654 655 # # 656 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 657 # stats["Infos"]["Number of samples"] = nb_of_samples 658 # elif nb_of_samples: 659 # stats["Infos"]["Number of samples"] = "not a VCF format" 660 661 ### INFO and FORMAT fields 662 header_types_df = {} 663 header_types_list = { 664 "List of INFO fields": header_infos, 665 "List of FORMAT fields": header_formats, 666 } 667 i = 0 668 for header_type in header_types_list: 669 670 header_type_infos = header_types_list.get(header_type) 671 header_infos_dict = {} 672 673 for info in header_type_infos: 674 675 i += 1 676 header_infos_dict[i] = {} 677 678 # ID 679 header_infos_dict[i]["id"] = info 680 681 # num 682 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 683 if header_type_infos[info].num in genotype_map.keys(): 684 header_infos_dict[i]["Number"] = genotype_map.get( 685 header_type_infos[info].num 686 ) 687 else: 688 header_infos_dict[i]["Number"] = header_type_infos[info].num 689 690 # type 691 if header_type_infos[info].type: 692 header_infos_dict[i]["Type"] = header_type_infos[info].type 693 else: 694 header_infos_dict[i]["Type"] = "." 695 696 # desc 697 if header_type_infos[info].desc != None: 698 header_infos_dict[i]["Description"] = header_type_infos[info].desc 699 else: 700 header_infos_dict[i]["Description"] = "" 701 702 if len(header_infos_dict): 703 header_types_df[header_type] = pd.DataFrame.from_dict( 704 header_infos_dict, orient="index" 705 ).to_dict(orient="index") 706 707 # Stats 708 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 709 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 710 stats["Header"] = header_types_df 711 712 ### QUAL 713 if "QUAL" in self.get_header_columns(): 714 sql_query_qual = f""" 715 SELECT 716 avg(CAST(QUAL AS INTEGER)) AS Average, 717 min(CAST(QUAL AS INTEGER)) AS Minimum, 718 max(CAST(QUAL AS INTEGER)) AS Maximum, 719 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 720 median(CAST(QUAL AS INTEGER)) AS Median, 721 variance(CAST(QUAL AS INTEGER)) AS Variance 722 FROM {table_variants_from} 723 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 724 """ 725 726 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 727 stats["Quality"] = {"Stats": qual} 728 729 ### SNV and InDel 730 731 sql_query_snv = f""" 732 733 SELECT Type, count FROM ( 734 735 SELECT 736 'Total' AS Type, 737 count(*) AS count 738 FROM {table_variants_from} 739 740 UNION 741 742 SELECT 743 'MNV' AS Type, 744 count(*) AS count 745 FROM {table_variants_from} 746 WHERE len(REF) > 1 AND len(ALT) > 1 747 AND len(REF) = len(ALT) 748 749 UNION 750 751 SELECT 752 'InDel' AS Type, 753 count(*) AS count 754 FROM 
{table_variants_from} 755 WHERE len(REF) > 1 OR len(ALT) > 1 756 AND len(REF) != len(ALT) 757 758 UNION 759 760 SELECT 761 'SNV' AS Type, 762 count(*) AS count 763 FROM {table_variants_from} 764 WHERE len(REF) = 1 AND len(ALT) = 1 765 766 ) 767 768 ORDER BY count DESC 769 770 """ 771 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 772 773 sql_query_snv_substitution = f""" 774 SELECT 775 concat(REF, '>', ALT) AS 'Substitution', 776 count(*) AS count 777 FROM {table_variants_from} 778 WHERE len(REF) = 1 AND len(ALT) = 1 779 GROUP BY REF, ALT 780 ORDER BY count(*) DESC 781 """ 782 snv_substitution = ( 783 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 784 ) 785 stats["Variants"]["Counts"] = snv_indel 786 stats["Variants"]["Substitutions"] = snv_substitution 787 788 return stats 789 790 def stats_to_file(self, file: str = None) -> str: 791 """ 792 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 793 into a JSON object, and writes the JSON object to the specified file. 794 795 :param file: The `file` parameter is a string that represents the file path where the JSON data 796 will be written 797 :type file: str 798 :return: the name of the file that was written to. 799 """ 800 801 # Get stats 802 stats = self.get_stats() 803 804 # Serializing json 805 json_object = json.dumps(stats, indent=4) 806 807 # Writing to sample.json 808 with open(file, "w") as outfile: 809 outfile.write(json_object) 810 811 return file 812 813 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 814 """ 815 The `print_stats` function generates a markdown file and prints the statistics contained in a 816 JSON file in a formatted manner. 817 818 :param output_file: The `output_file` parameter is a string that specifies the path and filename 819 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 820 provided, a temporary directory will be created and the stats will be saved in a file named 821 "stats.md" within that 822 :type output_file: str 823 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 824 file where the statistics will be saved. If no value is provided, a temporary directory will be 825 created and a default file name "stats.json" will be used 826 :type json_file: str 827 :return: The function `print_stats` does not return any value. It has a return type annotation 828 of `None`. 829 """ 830 831 # Full path 832 output_file = full_path(output_file) 833 json_file = full_path(json_file) 834 835 with tempfile.TemporaryDirectory() as tmpdir: 836 837 # Files 838 if not output_file: 839 output_file = os.path.join(tmpdir, "stats.md") 840 if not json_file: 841 json_file = os.path.join(tmpdir, "stats.json") 842 843 # Create folders 844 if not os.path.exists(os.path.dirname(output_file)): 845 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 846 if not os.path.exists(os.path.dirname(json_file)): 847 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 848 849 # Create stats JSON file 850 stats_file = self.stats_to_file(file=json_file) 851 852 # Print stats file 853 with open(stats_file) as f: 854 stats = yaml.safe_load(f) 855 856 # Output 857 output_title = [] 858 output_index = [] 859 output = [] 860 861 # Title 862 output_title.append("# HOWARD Stats") 863 864 # Index 865 output_index.append("## Index") 866 867 # Process sections 868 for section in stats: 869 infos = stats.get(section) 870 section_link = "#" + section.lower().replace(" ", "-") 871 output.append(f"## {section}") 872 output_index.append(f"- [{section}]({section_link})") 873 874 if len(infos): 875 for info in infos: 876 try: 877 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 878 is_df = True 879 except: 880 try: 881 df = pd.DataFrame.from_dict( 882 
json.loads((infos.get(info))), orient="index" 883 ) 884 is_df = True 885 except: 886 is_df = False 887 if is_df: 888 output.append(f"### {info}") 889 info_link = "#" + info.lower().replace(" ", "-") 890 output_index.append(f" - [{info}]({info_link})") 891 output.append(f"{df.to_markdown(index=False)}") 892 else: 893 output.append(f"- {info}: {infos.get(info)}") 894 else: 895 output.append(f"NA") 896 897 # Write stats in markdown file 898 with open(output_file, "w") as fp: 899 for item in output_title: 900 fp.write("%s\n" % item) 901 for item in output_index: 902 fp.write("%s\n" % item) 903 for item in output: 904 fp.write("%s\n" % item) 905 906 # Output stats in markdown 907 print("") 908 print("\n\n".join(output_title)) 909 print("") 910 print("\n\n".join(output)) 911 print("") 912 913 return None 914 915 def get_input(self) -> str: 916 """ 917 It returns the value of the input variable. 918 :return: The input is being returned. 919 """ 920 return self.input 921 922 def get_input_format(self, input_file: str = None) -> str: 923 """ 924 This function returns the format of the input variable, either from the provided input file or 925 by prompting for input. 926 927 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 928 represents the file path of the input file. If no `input_file` is provided when calling the 929 method, it will default to `None` 930 :type input_file: str 931 :return: The format of the input variable is being returned. 932 """ 933 934 if not input_file: 935 input_file = self.get_input() 936 input_format = get_file_format(input_file) 937 return input_format 938 939 def get_input_compressed(self, input_file: str = None) -> str: 940 """ 941 The function `get_input_compressed` returns the format of the input variable after compressing 942 it. 943 944 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 945 that represents the file path of the input file. 
If no `input_file` is provided when calling the 946 method, it will default to `None` and the method will then call `self.get_input()` to 947 :type input_file: str 948 :return: The function `get_input_compressed` returns the compressed format of the input 949 variable. 950 """ 951 952 if not input_file: 953 input_file = self.get_input() 954 input_compressed = get_file_compressed(input_file) 955 return input_compressed 956 957 def get_output(self) -> str: 958 """ 959 It returns the output of the neuron. 960 :return: The output of the neural network. 961 """ 962 963 return self.output 964 965 def get_output_format(self, output_file: str = None) -> str: 966 """ 967 The function `get_output_format` returns the format of the input variable or the output file if 968 provided. 969 970 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 971 that represents the file path of the output file. If no `output_file` is provided when calling 972 the method, it will default to the output obtained from the `get_output` method of the class 973 instance. The 974 :type output_file: str 975 :return: The format of the input variable is being returned. 976 """ 977 978 if not output_file: 979 output_file = self.get_output() 980 output_format = get_file_format(output_file) 981 982 return output_format 983 984 def get_config(self) -> dict: 985 """ 986 It returns the config 987 :return: The config variable is being returned. 988 """ 989 return self.config 990 991 def get_param(self) -> dict: 992 """ 993 It returns the param 994 :return: The param variable is being returned. 995 """ 996 return self.param 997 998 def get_connexion_db(self) -> str: 999 """ 1000 It returns the connexion_db attribute of the object 1001 :return: The connexion_db is being returned. 1002 """ 1003 return self.connexion_db 1004 1005 def get_prefix(self) -> str: 1006 """ 1007 It returns the prefix of the object. 1008 :return: The prefix is being returned. 
1009 """ 1010 return self.prefix 1011 1012 def get_table_variants(self, clause: str = "select") -> str: 1013 """ 1014 This function returns the table_variants attribute of the object 1015 1016 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1017 defaults to select (optional) 1018 :return: The table_variants attribute of the object. 1019 """ 1020 1021 # Access 1022 access = self.get_config().get("access", None) 1023 1024 # Clauses "select", "where", "update" 1025 if clause in ["select", "where", "update"]: 1026 table_variants = self.table_variants 1027 # Clause "from" 1028 elif clause in ["from"]: 1029 # For Read Only 1030 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1031 input_file = self.get_input() 1032 table_variants = f"'{input_file}' as variants" 1033 # For Read Write 1034 else: 1035 table_variants = f"{self.table_variants} as variants" 1036 else: 1037 table_variants = self.table_variants 1038 return table_variants 1039 1040 def get_tmp_dir(self) -> str: 1041 """ 1042 The function `get_tmp_dir` returns the temporary directory path based on configuration 1043 parameters or a default path. 1044 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1045 configuration, parameters, and a default value of "/tmp". 1046 """ 1047 1048 return get_tmp( 1049 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1050 ) 1051 1052 def get_connexion_type(self) -> str: 1053 """ 1054 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1055 1056 :return: The connexion type is being returned. 1057 """ 1058 return self.get_config().get("connexion_type", "memory") 1059 1060 def get_connexion(self): 1061 """ 1062 It returns the connection object 1063 1064 :return: The connection object. 1065 """ 1066 return self.conn 1067 1068 def close_connexion(self) -> None: 1069 """ 1070 This function closes the connection to the database. 
1071 :return: The connection is being closed. 1072 """ 1073 return self.conn.close() 1074 1075 def get_header(self, type: str = "vcf"): 1076 """ 1077 This function returns the header of the VCF file as a list of strings 1078 1079 :param type: the type of header you want to get, defaults to vcf (optional) 1080 :return: The header of the vcf file. 1081 """ 1082 1083 if self.header_vcf: 1084 if type == "vcf": 1085 return self.header_vcf 1086 elif type == "list": 1087 return self.header_list 1088 else: 1089 if type == "vcf": 1090 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1091 return header 1092 elif type == "list": 1093 return vcf_required 1094 1095 def get_header_length(self, file: str = None) -> int: 1096 """ 1097 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1098 line. 1099 1100 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1101 header file. If this argument is provided, the function will read the header from the specified 1102 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1103 :type file: str 1104 :return: the length of the header list, excluding the #CHROM line. 1105 """ 1106 1107 if file: 1108 return len(self.read_vcf_header_file(file=file)) - 1 1109 elif self.get_header(type="list"): 1110 return len(self.get_header(type="list")) - 1 1111 else: 1112 return 0 1113 1114 def get_header_columns(self) -> str: 1115 """ 1116 This function returns the header list of a VCF 1117 1118 :return: The length of the header list. 1119 """ 1120 if self.get_header(): 1121 return self.get_header(type="list")[-1] 1122 else: 1123 return "" 1124 1125 def get_header_columns_as_list(self) -> list: 1126 """ 1127 This function returns the header list of a VCF 1128 1129 :return: The length of the header list. 
1130 """ 1131 if self.get_header(): 1132 return self.get_header_columns().strip().split("\t") 1133 else: 1134 return [] 1135 1136 def get_header_columns_as_sql(self) -> str: 1137 """ 1138 This function retruns header length (without #CHROM line) 1139 1140 :return: The length of the header list. 1141 """ 1142 sql_column_list = [] 1143 for col in self.get_header_columns_as_list(): 1144 sql_column_list.append(f'"{col}"') 1145 return ",".join(sql_column_list) 1146 1147 def get_header_sample_list( 1148 self, check: bool = False, samples: list = None, samples_force: bool = False 1149 ) -> list: 1150 """ 1151 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1152 checking and filtering based on input parameters. 1153 1154 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1155 parameter that determines whether to check if the samples in the list are properly defined as 1156 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1157 list is defined as a, defaults to False 1158 :type check: bool (optional) 1159 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1160 allows you to specify a subset of samples from the header. If you provide a list of sample 1161 names, the function will check if each sample is defined in the header. If a sample is not found 1162 in the 1163 :type samples: list 1164 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1165 a boolean parameter that determines whether to force the function to return the sample list 1166 without checking if the samples are genotype columns. 
If `samples_force` is set to `True`, the 1167 function will return the sample list without performing, defaults to False 1168 :type samples_force: bool (optional) 1169 :return: The function `get_header_sample_list` returns a list of samples based on the input 1170 parameters and conditions specified in the function. 1171 """ 1172 1173 # Init 1174 samples_list = [] 1175 1176 if samples is None: 1177 samples_list = self.header_vcf.samples 1178 else: 1179 samples_checked = [] 1180 for sample in samples: 1181 if sample in self.header_vcf.samples: 1182 samples_checked.append(sample) 1183 else: 1184 log.warning(f"Sample '{sample}' not defined in header") 1185 samples_list = samples_checked 1186 1187 # Force sample list without checking if is_genotype_column 1188 if samples_force: 1189 log.warning(f"Samples {samples_list} not checked if genotypes") 1190 return samples_list 1191 1192 if check: 1193 samples_checked = [] 1194 for sample in samples_list: 1195 if self.is_genotype_column(column=sample): 1196 samples_checked.append(sample) 1197 else: 1198 log.warning( 1199 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1200 ) 1201 samples_list = samples_checked 1202 1203 # Return samples list 1204 return samples_list 1205 1206 def is_genotype_column(self, column: str = None) -> bool: 1207 """ 1208 This function checks if a given column is a genotype column in a database. 1209 1210 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1211 represents the column name in a database table. This method checks if the specified column is a 1212 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1213 method of 1214 :type column: str 1215 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1216 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1217 column name and returns the result. 
    def get_verbose(self) -> bool:
        """
        Return the value of the "verbose" key in the config dictionary, or
        False if the key doesn't exist.

        :return: the "verbose" flag from the config
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object.

        :return: the connexion format, either "duckdb" or "sqlite"
        :raises ValueError: when the stored connexion format is unknown
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table, using the mechanism appropriate for the connexion
        format.

        :param file: the file (path or file-like object) to load into the table
        :param columns: comma-separated list of quoted column names used in the
            INSERT statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            the "load"/"chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load"/"chunk" entry overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize resolves to a falsy value (0/None),
        # nothing is inserted at all — confirm this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the "chunk" name in the SQL below to the
                    # local pandas DataFrame via its replacement scan — do not
                    # rename the loop variable.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file and load it into the variants table, with options
        to drop the table before loading and to specify a sample size.

        :param input_file: path to the input file; when given, it replaces the
            object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: when True, drop the variants table before
            loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: number of rows sampled by the Database helper
            (set to -1, i.e. unlimited, when falsy), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: when the input format is not compatible with the
            connexion format, or the input format is not available
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access ("RO" selects read-only behaviour below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (-1 means "no sampling limit")
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): this branch is unreachable — the enclosing
                    # `if` already guarantees connexion_format is "duckdb".
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only: a view avoids copying the data; otherwise a
                    # table is materialized
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # NOTE(review): bare except — any failure here (including
                    # SQL or connexion errors) is reported as "format not
                    # available".
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: fixed VCF columns, all stored as VARCHAR except POS
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — keys added to
            # structure_complete also mutate structure (harmless here since
            # structure is not reused, but worth confirming).
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter for the given input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): for compressed input the plain handle opened
                # above is shadowed (not closed) by the bgzf handle — confirm
                # this is intended.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
default NULL' 1441 ) 1442 sql_create_table_columns_list.append(f'"{column}"') 1443 1444 # Create database 1445 log.debug(f"Create Table {table_variants}") 1446 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1447 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1448 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1449 self.conn.execute(sql_create_table) 1450 1451 # chunksize define length of file chunk load file 1452 chunksize = 100000 1453 1454 # delimiter 1455 delimiter = file_format_delimiters.get(input_format, "\t") 1456 1457 # Load the input file 1458 with open(self.input, "rt") as input_file: 1459 1460 # Use the appropriate file handler based on the input format 1461 if input_compressed: 1462 input_file = bgzf.open(self.input, "rt") 1463 if input_format in ["vcf"]: 1464 header_len = self.get_header_length() 1465 else: 1466 header_len = 0 1467 1468 # Insert the file contents into a table 1469 self.insert_file_to_table( 1470 input_file, 1471 columns=sql_create_table_columns_list_sql, 1472 header_len=header_len, 1473 sep=delimiter, 1474 chunksize=chunksize, 1475 ) 1476 1477 else: 1478 log.error( 1479 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1480 ) 1481 raise ValueError( 1482 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1483 ) 1484 1485 # Explode INFOS fields into table fields 1486 if self.get_explode_infos(): 1487 self.explode_infos( 1488 prefix=self.get_explode_infos_prefix(), 1489 fields=self.get_explode_infos_fields(), 1490 force=True, 1491 ) 1492 1493 # Create index after insertion 1494 self.create_indexes() 1495 1496 def get_explode_infos(self) -> bool: 1497 """ 1498 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1499 to False if it is not set. 
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, resolved from the input
        parameter or from the "explode"/"explode_infos_fields" parameter.

        Fields may be given as a comma-separated string or a list; each entry
        is treated as a regex pattern matched against the header INFO fields,
        and the "*" keyword matches all of them.

        :param explode_infos_fields: the fields to explode — "*" for all
            fields, or a comma-separated list of field names/patterns
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: when True, fields not present in
            the header are excluded from the result, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: the resolved list of field names to explode
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (comma-separated string, or already a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below.
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (deduplicated and sorted)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact header match wins; otherwise keep pattern matches,
                # minus fields already listed explicitly in the input (so
                # explicit entries control their own position in the output)
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
1596 and new_field not in [".*"] 1597 ): 1598 fields_output.append(new_field) 1599 1600 return fields_output 1601 1602 else: 1603 1604 return [] 1605 1606 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1607 """ 1608 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1609 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1610 not provided. 1611 1612 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1613 prefix to be used for exploding or expanding information 1614 :type explode_infos_prefix: str 1615 :return: the value of the variable `explode_infos_prefix`. 1616 """ 1617 1618 if not explode_infos_prefix: 1619 explode_infos_prefix = ( 1620 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1621 ) 1622 1623 return explode_infos_prefix 1624 1625 def add_column( 1626 self, 1627 table_name, 1628 column_name, 1629 column_type, 1630 default_value=None, 1631 drop: bool = False, 1632 ) -> dict: 1633 """ 1634 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1635 doesn't already exist. 1636 1637 :param table_name: The name of the table to which you want to add a column 1638 :param column_name: The parameter "column_name" is the name of the column that you want to add 1639 to the table 1640 :param column_type: The `column_type` parameter specifies the data type of the column that you 1641 want to add to the table. It should be a string that represents the desired data type, such as 1642 "INTEGER", "TEXT", "REAL", etc 1643 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1644 default value for the newly added column. 
If a default value is provided, it will be assigned to 1645 the column for any existing rows that do not have a value for that column 1646 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1647 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1648 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1649 to False 1650 :type drop: bool (optional) 1651 :return: a boolean value indicating whether the column was successfully added to the table. 1652 """ 1653 1654 # added 1655 added = False 1656 dropped = False 1657 1658 # Check if the column already exists in the table 1659 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1660 columns = self.get_query_to_df(query).columns.tolist() 1661 if column_name.upper() in [c.upper() for c in columns]: 1662 log.debug( 1663 f"The {column_name} column already exists in the {table_name} table" 1664 ) 1665 if drop: 1666 self.drop_column(table_name=table_name, column_name=column_name) 1667 dropped = True 1668 else: 1669 return None 1670 else: 1671 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1672 1673 # Add column in table 1674 add_column_query = ( 1675 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1676 ) 1677 if default_value is not None: 1678 add_column_query += f" DEFAULT {default_value}" 1679 self.execute_query(add_column_query) 1680 added = not dropped 1681 log.debug( 1682 f"The {column_name} column was successfully added to the {table_name} table" 1683 ) 1684 1685 if added: 1686 added_column = { 1687 "table_name": table_name, 1688 "column_name": column_name, 1689 "column_type": column_type, 1690 "default_value": default_value, 1691 } 1692 else: 1693 added_column = None 1694 1695 return added_column 1696 1697 def drop_column( 1698 self, column: dict = None, table_name: str = None, column_name: str = None 1699 ) -> bool: 1700 """ 1701 The 
`drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 
1753 return removed 1754 1755 def explode_infos( 1756 self, 1757 prefix: str = None, 1758 create_index: bool = False, 1759 fields: list = None, 1760 force: bool = False, 1761 proccess_all_fields_together: bool = False, 1762 table: str = None, 1763 ) -> list: 1764 """ 1765 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1766 individual columns, returning a list of added columns. 1767 1768 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1769 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1770 `self.get_explode_infos_prefix()` as the prefix 1771 :type prefix: str 1772 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1773 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1774 `False`, indexes will not be created. The default value is `False`, defaults to False 1775 :type create_index: bool (optional) 1776 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1777 that you want to explode into individual columns. If this parameter is not provided, all INFO 1778 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1779 a list to the ` 1780 :type fields: list 1781 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1782 determines whether to drop and recreate a column if it already exists in the table. If `force` 1783 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1784 defaults to False 1785 :type force: bool (optional) 1786 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1787 flag that determines whether to process all the INFO fields together or individually. If set to 1788 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1789 be processed individually. The default value is, defaults to False 1790 :type proccess_all_fields_together: bool (optional) 1791 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1792 of the table where the exploded INFO fields will be added as individual columns. If you provide 1793 a value for the `table` parameter, the function will use that table name. If the `table` 1794 parameter is 1795 :type table: str 1796 :return: The `explode_infos` function returns a list of added columns. 1797 """ 1798 1799 # drop indexes 1800 self.drop_indexes() 1801 1802 # connexion format 1803 connexion_format = self.get_connexion_format() 1804 1805 # Access 1806 access = self.get_config().get("access", None) 1807 1808 # Added columns 1809 added_columns = [] 1810 1811 if access not in ["RO"]: 1812 1813 # prefix 1814 if prefix in [None, True] or not isinstance(prefix, str): 1815 if self.get_explode_infos_prefix() not in [None, True]: 1816 prefix = self.get_explode_infos_prefix() 1817 else: 1818 prefix = "INFO/" 1819 1820 # table variants 1821 if table is not None: 1822 table_variants = table 1823 else: 1824 table_variants = self.get_table_variants(clause="select") 1825 1826 # extra infos 1827 try: 1828 extra_infos = self.get_extra_infos() 1829 except: 1830 extra_infos = [] 1831 1832 # Header infos 1833 header_infos = self.get_header().infos 1834 1835 log.debug( 1836 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1837 ) 1838 1839 sql_info_alter_table_array = [] 1840 1841 # Info fields to check 1842 fields_list = list(header_infos) 1843 if fields: 1844 fields_list += fields 1845 fields_list = set(fields_list) 1846 1847 # If no fields 1848 if not fields: 1849 fields = [] 1850 1851 # Translate fields if patterns 1852 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1853 1854 for info in fields: 1855 1856 info_id_sql = prefix + info 1857 1858 if ( 1859 info 
in fields_list 1860 or prefix + info in fields_list 1861 or info in extra_infos 1862 ): 1863 1864 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1865 1866 if info in header_infos: 1867 info_type = header_infos[info].type 1868 info_num = header_infos[info].num 1869 else: 1870 info_type = "String" 1871 info_num = 0 1872 1873 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1874 if info_num != 1: 1875 type_sql = "VARCHAR" 1876 1877 # Add field 1878 added_column = self.add_column( 1879 table_name=table_variants, 1880 column_name=info_id_sql, 1881 column_type=type_sql, 1882 default_value="null", 1883 drop=force, 1884 ) 1885 1886 if added_column: 1887 added_columns.append(added_column) 1888 1889 if added_column or force: 1890 1891 # add field to index 1892 self.index_additionnal_fields.append(info_id_sql) 1893 1894 # Update field array 1895 if connexion_format in ["duckdb"]: 1896 update_info_field = f""" 1897 "{info_id_sql}" = 1898 CASE 1899 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1900 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1901 END 1902 """ 1903 elif connexion_format in ["sqlite"]: 1904 update_info_field = f""" 1905 "{info_id_sql}" = 1906 CASE 1907 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1908 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1909 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1910 END 1911 """ 1912 1913 sql_info_alter_table_array.append(update_info_field) 1914 1915 if sql_info_alter_table_array: 1916 1917 # By chromosomes 1918 try: 1919 chromosomes_list = list( 1920 self.get_query_to_df( 1921 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1922 )["#CHROM"] 1923 ) 1924 except: 1925 chromosomes_list = [None] 1926 1927 for chrom in chromosomes_list: 1928 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1929 1930 # Where clause 1931 where_clause = "" 1932 if chrom and len(chromosomes_list) > 1: 1933 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1934 1935 # Update table 1936 if proccess_all_fields_together: 1937 sql_info_alter_table_array_join = ", ".join( 1938 sql_info_alter_table_array 1939 ) 1940 if sql_info_alter_table_array_join: 1941 sql_info_alter_table = f""" 1942 UPDATE {table_variants} 1943 SET {sql_info_alter_table_array_join} 1944 {where_clause} 1945 """ 1946 log.debug( 1947 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1948 ) 1949 # log.debug(sql_info_alter_table) 1950 self.conn.execute(sql_info_alter_table) 1951 else: 1952 sql_info_alter_num = 0 1953 for sql_info_alter in sql_info_alter_table_array: 1954 sql_info_alter_num += 1 1955 sql_info_alter_table = f""" 1956 UPDATE {table_variants} 1957 SET {sql_info_alter} 1958 {where_clause} 1959 """ 1960 log.debug( 1961 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1962 ) 1963 # log.debug(sql_info_alter_table) 1964 self.conn.execute(sql_info_alter_table) 1965 1966 # create indexes 1967 if create_index: 1968 self.create_indexes() 1969 1970 return added_columns 1971 1972 def create_indexes(self) -> None: 1973 """ 1974 Create indexes on the table after insertion 1975 """ 1976 1977 # Access 1978 access = self.get_config().get("access", None) 1979 1980 # get table variants 1981 table_variants = self.get_table_variants("FROM") 1982 1983 if self.get_indexing() and access not in ["RO"]: 1984 # Create index 1985 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1986 self.conn.execute(sql_create_table_index) 1987 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1988 self.conn.execute(sql_create_table_index) 1989 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1990 self.conn.execute(sql_create_table_index) 1991 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1992 self.conn.execute(sql_create_table_index) 1993 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1994 self.conn.execute(sql_create_table_index) 1995 for field in self.index_additionnal_fields: 1996 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1997 self.conn.execute(sql_create_table_index) 1998 1999 def drop_indexes(self) -> None: 2000 """ 2001 Create indexes on the table after insertion 2002 """ 2003 2004 # Access 2005 access = self.get_config().get("access", None) 2006 2007 # get table variants 2008 table_variants = self.get_table_variants("FROM") 2009 2010 # Get database format 2011 connexion_format = 
self.get_connexion_format() 2012 2013 if access not in ["RO"]: 2014 if connexion_format in ["duckdb"]: 2015 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2016 elif connexion_format in ["sqlite"]: 2017 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2018 2019 list_indexes = self.conn.execute(sql_list_indexes) 2020 index_names = [row[0] for row in list_indexes.fetchall()] 2021 for index in index_names: 2022 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2023 self.conn.execute(sql_drop_table_index) 2024 2025 def read_vcf_header(self, f) -> list: 2026 """ 2027 It reads the header of a VCF file and returns a list of the header lines 2028 2029 :param f: the file object 2030 :return: The header lines of the VCF file. 2031 """ 2032 2033 header_list = [] 2034 for line in f: 2035 header_list.append(line) 2036 if line.startswith("#CHROM"): 2037 break 2038 return header_list 2039 2040 def read_vcf_header_file(self, file: str = None) -> list: 2041 """ 2042 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2043 uncompressed files. 2044 2045 :param file: The `file` parameter is a string that represents the path to the VCF header file 2046 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2047 default to `None` 2048 :type file: str 2049 :return: The function `read_vcf_header_file` returns a list. 2050 """ 2051 2052 if self.get_input_compressed(input_file=file): 2053 with bgzf.open(file, "rt") as f: 2054 return self.read_vcf_header(f=f) 2055 else: 2056 with open(file, "rt") as f: 2057 return self.read_vcf_header(f=f) 2058 2059 def execute_query(self, query: str): 2060 """ 2061 It takes a query as an argument, executes it, and returns the results 2062 2063 :param query: The query to be executed 2064 :return: The result of the query is being returned. 
2065 """ 2066 if query: 2067 return self.conn.execute(query) # .fetchall() 2068 else: 2069 return None 2070 2071 def export_output( 2072 self, 2073 output_file: str | None = None, 2074 output_header: str | None = None, 2075 export_header: bool = True, 2076 query: str | None = None, 2077 parquet_partitions: list | None = None, 2078 chunk_size: int | None = None, 2079 threads: int | None = None, 2080 sort: bool = False, 2081 index: bool = False, 2082 order_by: str | None = None, 2083 ) -> bool: 2084 """ 2085 The `export_output` function exports data from a VCF file to a specified output file in various 2086 formats, including VCF, CSV, TSV, PSV, and Parquet. 2087 2088 :param output_file: The `output_file` parameter is a string that specifies the name of the 2089 output file to be generated by the function. This is where the exported data will be saved 2090 :type output_file: str 2091 :param output_header: The `output_header` parameter is a string that specifies the name of the 2092 file where the header of the VCF file will be exported. If this parameter is not provided, the 2093 header will be exported to a file with the same name as the `output_file` parameter, but with 2094 the extension " 2095 :type output_header: str 2096 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2097 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2098 True, the header will be exported to a file. If `export_header` is False, the header will not 2099 be, defaults to True, if output format is not VCF 2100 :type export_header: bool (optional) 2101 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2102 select specific data from the VCF file before exporting it. 
If provided, only the data that 2103 matches the query will be exported 2104 :type query: str 2105 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2106 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2107 organize data in a hierarchical directory structure based on the values of one or more columns. 2108 This can improve query performance when working with large datasets 2109 :type parquet_partitions: list 2110 :param chunk_size: The `chunk_size` parameter specifies the number of 2111 records in batch when exporting data in Parquet format. This parameter is used for 2112 partitioning the Parquet file into multiple files. 2113 :type chunk_size: int 2114 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2115 threads to be used during the export process. It determines the level of parallelism and can 2116 improve the performance of the export operation. If not provided, the function will use the 2117 default number of threads 2118 :type threads: int 2119 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2120 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2121 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2122 False 2123 :type sort: bool (optional) 2124 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2125 created on the output file. If `index` is True, an index will be created. If `index` is False, 2126 no index will be created. The default value is False, defaults to False 2127 :type index: bool (optional) 2128 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2129 sorting the output file. This parameter is only applicable when exporting data in VCF format 2130 :type order_by: str 2131 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2132 None if it doesn't. 2133 """ 2134 2135 # Log 2136 log.info("Exporting...") 2137 2138 # Full path 2139 output_file = full_path(output_file) 2140 output_header = full_path(output_header) 2141 2142 # Config 2143 config = self.get_config() 2144 2145 # Param 2146 param = self.get_param() 2147 2148 # Tmp files to remove 2149 tmp_to_remove = [] 2150 2151 # If no output, get it 2152 if not output_file: 2153 output_file = self.get_output() 2154 2155 # If not threads 2156 if not threads: 2157 threads = self.get_threads() 2158 2159 # Auto header name with extension 2160 if export_header or output_header: 2161 if not output_header: 2162 output_header = f"{output_file}.hdr" 2163 # Export header 2164 self.export_header(output_file=output_file) 2165 2166 # Switch off export header if VCF output 2167 output_file_type = get_file_format(output_file) 2168 if output_file_type in ["vcf"]: 2169 export_header = False 2170 tmp_to_remove.append(output_header) 2171 2172 # Chunk size 2173 if not chunk_size: 2174 chunk_size = config.get("chunk_size", None) 2175 2176 # Parquet partition 2177 if not parquet_partitions: 2178 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2179 if parquet_partitions and isinstance(parquet_partitions, str): 2180 parquet_partitions = parquet_partitions.split(",") 2181 2182 # Order by 2183 if not order_by: 2184 order_by = param.get("export", {}).get("order_by", "") 2185 2186 # Header in output 2187 header_in_output = param.get("export", {}).get("include_header", False) 2188 2189 # Database 2190 database_source = self.get_connexion() 2191 2192 # Connexion format 2193 connexion_format = self.get_connexion_format() 2194 2195 # Explode infos 2196 if self.get_explode_infos(): 2197 self.explode_infos( 2198 prefix=self.get_explode_infos_prefix(), 2199 fields=self.get_explode_infos_fields(), 2200 force=False, 2201 ) 2202 2203 # if connexion_format in ["sqlite"] or query: 
2204 if connexion_format in ["sqlite"]: 2205 2206 # Export in Parquet 2207 random_tmp = "".join( 2208 random.choice(string.ascii_lowercase) for i in range(10) 2209 ) 2210 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2211 tmp_to_remove.append(database_source) 2212 2213 # Table Variants 2214 table_variants = self.get_table_variants() 2215 2216 # Create export query 2217 sql_query_export_subquery = f""" 2218 SELECT * FROM {table_variants} 2219 """ 2220 2221 # Write source file 2222 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2223 2224 # Create database 2225 database = Database( 2226 database=database_source, 2227 table="variants", 2228 header_file=output_header, 2229 conn_config=self.get_connexion_config(), 2230 ) 2231 2232 # Existing colomns header 2233 existing_columns_header = database.get_header_columns_from_database() 2234 2235 # Sample list 2236 if output_file_type in ["vcf"]: 2237 get_samples = self.get_samples() 2238 get_samples_check = self.get_samples_check() 2239 samples_force = get_samples is not None 2240 sample_list = self.get_header_sample_list( 2241 check=get_samples_check, 2242 samples=get_samples, 2243 samples_force=samples_force, 2244 ) 2245 else: 2246 sample_list = None 2247 2248 # Export file 2249 database.export( 2250 output_database=output_file, 2251 output_header=output_header, 2252 existing_columns_header=existing_columns_header, 2253 parquet_partitions=parquet_partitions, 2254 chunk_size=chunk_size, 2255 threads=threads, 2256 sort=sort, 2257 index=index, 2258 header_in_output=header_in_output, 2259 order_by=order_by, 2260 query=query, 2261 export_header=export_header, 2262 sample_list=sample_list, 2263 ) 2264 2265 # Remove 2266 remove_if_exists(tmp_to_remove) 2267 2268 return (os.path.exists(output_file) or None) and ( 2269 os.path.exists(output_file) or None 2270 ) 2271 2272 def get_extra_infos(self, table: str = None) -> list: 2273 """ 2274 The `get_extra_infos` function returns 
a list of columns that are in a specified table but not 2275 in the header. 2276 2277 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2278 name of the table from which you want to retrieve the extra columns that are not present in the 2279 header. If the `table` parameter is not provided when calling the function, it will default to 2280 using the variants 2281 :type table: str 2282 :return: A list of columns that are in the specified table but not in the header of the table. 2283 """ 2284 2285 header_columns = [] 2286 2287 if not table: 2288 table = self.get_table_variants(clause="from") 2289 header_columns = self.get_header_columns() 2290 2291 # Check all columns in the database 2292 query = f""" SELECT * FROM {table} LIMIT 1 """ 2293 log.debug(f"query {query}") 2294 table_columns = self.get_query_to_df(query).columns.tolist() 2295 extra_columns = [] 2296 2297 # Construct extra infos (not in header) 2298 for column in table_columns: 2299 if column not in header_columns: 2300 extra_columns.append(column) 2301 2302 return extra_columns 2303 2304 def get_extra_infos_sql(self, table: str = None) -> str: 2305 """ 2306 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2307 by double quotes 2308 2309 :param table: The name of the table to get the extra infos from. If None, the default table is 2310 used 2311 :type table: str 2312 :return: A string of the extra infos 2313 """ 2314 2315 return ", ".join( 2316 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2317 ) 2318 2319 def export_header( 2320 self, 2321 header_name: str = None, 2322 output_file: str = None, 2323 output_file_ext: str = ".hdr", 2324 clean_header: bool = True, 2325 remove_chrom_line: bool = False, 2326 ) -> str: 2327 """ 2328 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2329 specified options, and writes it to a new file. 
2330 2331 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2332 this parameter is not specified, the header will be written to the output file 2333 :type header_name: str 2334 :param output_file: The `output_file` parameter in the `export_header` function is used to 2335 specify the name of the output file where the header will be written. If this parameter is not 2336 provided, the header will be written to a temporary file 2337 :type output_file: str 2338 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2339 string that represents the extension of the output header file. By default, it is set to ".hdr" 2340 if not specified by the user. This extension will be appended to the `output_file` name to 2341 create the final, defaults to .hdr 2342 :type output_file_ext: str (optional) 2343 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2344 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2345 `True`, the function will clean the header by modifying certain lines based on a specific 2346 pattern. If `clean_header`, defaults to True 2347 :type clean_header: bool (optional) 2348 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2349 boolean flag that determines whether the #CHROM line should be removed from the header before 2350 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2351 defaults to False 2352 :type remove_chrom_line: bool (optional) 2353 :return: The function `export_header` returns the name of the temporary header file that is 2354 created. 
2355 """ 2356 2357 if not header_name and not output_file: 2358 output_file = self.get_output() 2359 2360 if self.get_header(): 2361 2362 # Get header object 2363 header_obj = self.get_header() 2364 2365 # Create database 2366 db_for_header = Database(database=self.get_input()) 2367 2368 # Get real columns in the file 2369 db_header_columns = db_for_header.get_columns() 2370 2371 with tempfile.TemporaryDirectory() as tmpdir: 2372 2373 # Write header file 2374 header_file_tmp = os.path.join(tmpdir, "header") 2375 f = open(header_file_tmp, "w") 2376 vcf.Writer(f, header_obj) 2377 f.close() 2378 2379 # Replace #CHROM line with rel columns 2380 header_list = db_for_header.read_header_file( 2381 header_file=header_file_tmp 2382 ) 2383 header_list[-1] = "\t".join(db_header_columns) 2384 2385 # Remove CHROM line 2386 if remove_chrom_line: 2387 header_list.pop() 2388 2389 # Clean header 2390 if clean_header: 2391 header_list_clean = [] 2392 for head in header_list: 2393 # Clean head for malformed header 2394 head_clean = head 2395 head_clean = re.subn( 2396 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2397 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2398 head_clean, 2399 2, 2400 )[0] 2401 # Write header 2402 header_list_clean.append(head_clean) 2403 header_list = header_list_clean 2404 2405 tmp_header_name = output_file + output_file_ext 2406 2407 f = open(tmp_header_name, "w") 2408 for line in header_list: 2409 f.write(line) 2410 f.close() 2411 2412 return tmp_header_name 2413 2414 def export_variant_vcf( 2415 self, 2416 vcf_file, 2417 remove_info: bool = False, 2418 add_samples: bool = True, 2419 list_samples: list = [], 2420 where_clause: str = "", 2421 index: bool = False, 2422 threads: int | None = None, 2423 ) -> bool | None: 2424 """ 2425 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2426 remove INFO field, add samples, and control compression and indexing. 
2427 2428 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2429 written to. It is the output file that will contain the filtered VCF data based on the specified 2430 parameters 2431 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2432 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2433 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2434 in, defaults to False 2435 :type remove_info: bool (optional) 2436 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2437 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2438 If set to False, the samples will be removed. The default value is True, defaults to True 2439 :type add_samples: bool (optional) 2440 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2441 in the output VCF file. By default, all samples will be included. If you provide a list of 2442 samples, only those samples will be included in the output file 2443 :type list_samples: list 2444 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2445 determines whether or not to create an index for the output VCF file. If `index` is set to 2446 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2447 :type index: bool (optional) 2448 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2449 number of threads to use for exporting the VCF file. It determines how many parallel threads 2450 will be used during the export process. More threads can potentially speed up the export process 2451 by utilizing multiple cores of the processor. 
If 2452 :type threads: int | None 2453 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2454 method with various parameters including the output file, query, threads, sort flag, and index 2455 flag. The `export_output` method is responsible for exporting the VCF data based on the 2456 specified parameters and configurations provided in the `export_variant_vcf` function. 2457 """ 2458 2459 # Config 2460 config = self.get_config() 2461 2462 # Extract VCF 2463 log.debug("Export VCF...") 2464 2465 # Table variants 2466 table_variants = self.get_table_variants() 2467 2468 # Threads 2469 if not threads: 2470 threads = self.get_threads() 2471 2472 # Info fields 2473 if remove_info: 2474 if not isinstance(remove_info, str): 2475 remove_info = "." 2476 info_field = f"""'{remove_info}' as INFO""" 2477 else: 2478 info_field = "INFO" 2479 2480 # Samples fields 2481 if add_samples: 2482 if not list_samples: 2483 list_samples = self.get_header_sample_list() 2484 if list_samples: 2485 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2486 else: 2487 samples_fields = "" 2488 log.debug(f"samples_fields: {samples_fields}") 2489 else: 2490 samples_fields = "" 2491 2492 # Where clause 2493 if where_clause is None: 2494 where_clause = "" 2495 2496 # Variants 2497 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2498 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2499 log.debug(f"sql_query_select={sql_query_select}") 2500 2501 return self.export_output( 2502 output_file=vcf_file, 2503 output_header=None, 2504 export_header=True, 2505 query=sql_query_select, 2506 parquet_partitions=None, 2507 chunk_size=config.get("chunk_size", None), 2508 threads=threads, 2509 sort=True, 2510 index=index, 2511 order_by=None, 2512 ) 2513 2514 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2515 """ 2516 It takes a list of commands and runs 
them in parallel using the number of threads specified 2517 2518 :param commands: A list of commands to run 2519 :param threads: The number of threads to use, defaults to 1 (optional) 2520 """ 2521 2522 run_parallel_commands(commands, threads) 2523 2524 def get_threads(self, default: int = 1) -> int: 2525 """ 2526 This function returns the number of threads to use for a job, with a default value of 1 if not 2527 specified. 2528 2529 :param default: The `default` parameter in the `get_threads` method is used to specify the 2530 default number of threads to use if no specific value is provided. If no value is provided for 2531 the `threads` parameter in the configuration or input parameters, the `default` value will be 2532 used, defaults to 1 2533 :type default: int (optional) 2534 :return: the number of threads to use for the current job. 2535 """ 2536 2537 # Config 2538 config = self.get_config() 2539 2540 # Param 2541 param = self.get_param() 2542 2543 # Input threads 2544 input_thread = param.get("threads", config.get("threads", None)) 2545 2546 # Check threads 2547 if not input_thread: 2548 threads = default 2549 elif int(input_thread) <= 0: 2550 threads = os.cpu_count() 2551 else: 2552 threads = int(input_thread) 2553 return threads 2554 2555 def get_memory(self, default: str = None) -> str: 2556 """ 2557 This function retrieves the memory value from parameters or configuration with a default value 2558 if not found. 2559 2560 :param default: The `get_memory` function takes in a default value as a string parameter. This 2561 default value is used as a fallback in case the `memory` parameter is not provided in the 2562 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2563 the function 2564 :type default: str 2565 :return: The `get_memory` function returns a string value representing the memory parameter. If 2566 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2567 return the default value provided as an argument to the function. 2568 """ 2569 2570 # Config 2571 config = self.get_config() 2572 2573 # Param 2574 param = self.get_param() 2575 2576 # Input threads 2577 input_memory = param.get("memory", config.get("memory", None)) 2578 2579 # Check threads 2580 if input_memory: 2581 memory = input_memory 2582 else: 2583 memory = default 2584 2585 return memory 2586 2587 def update_from_vcf(self, vcf_file: str) -> None: 2588 """ 2589 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2590 2591 :param vcf_file: the path to the VCF file 2592 """ 2593 2594 connexion_format = self.get_connexion_format() 2595 2596 if connexion_format in ["duckdb"]: 2597 self.update_from_vcf_duckdb(vcf_file) 2598 elif connexion_format in ["sqlite"]: 2599 self.update_from_vcf_sqlite(vcf_file) 2600 2601 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2602 """ 2603 It takes a VCF file and updates the INFO column of the variants table in the database with the 2604 INFO column of the VCF file 2605 2606 :param vcf_file: the path to the VCF file 2607 """ 2608 2609 # varaints table 2610 table_variants = self.get_table_variants() 2611 2612 # Loading VCF into temporaire table 2613 skip = self.get_header_length(file=vcf_file) 2614 vcf_df = pd.read_csv( 2615 vcf_file, 2616 sep="\t", 2617 engine="c", 2618 skiprows=skip, 2619 header=0, 2620 low_memory=False, 2621 ) 2622 sql_query_update = f""" 2623 UPDATE {table_variants} as table_variants 2624 SET INFO = concat( 2625 CASE 2626 WHEN INFO NOT IN ('', '.') 2627 THEN INFO 2628 ELSE '' 2629 END, 2630 ( 2631 SELECT 2632 concat( 2633 CASE 2634 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2635 THEN ';' 2636 ELSE '' 2637 END 2638 , 2639 CASE 2640 WHEN table_parquet.INFO NOT IN ('','.') 2641 THEN table_parquet.INFO 2642 ELSE '' 2643 END 2644 ) 2645 FROM vcf_df as table_parquet 2646 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2647 AND table_parquet.\"POS\" = table_variants.\"POS\" 2648 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2649 AND table_parquet.\"REF\" = table_variants.\"REF\" 2650 AND table_parquet.INFO NOT IN ('','.') 2651 ) 2652 ) 2653 ; 2654 """ 2655 self.conn.execute(sql_query_update) 2656 2657 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2658 """ 2659 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2660 table, then updates the INFO column of the variants table with the INFO column of the temporary 2661 table 2662 2663 :param vcf_file: The path to the VCF file you want to update the database with 2664 """ 2665 2666 # Create a temporary table for the VCF 2667 table_vcf = "tmp_vcf" 2668 sql_create = ( 2669 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2670 ) 2671 self.conn.execute(sql_create) 2672 2673 # Loading VCF into temporaire table 2674 vcf_df = pd.read_csv( 2675 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2676 ) 2677 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2678 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2679 2680 # Update table 'variants' with VCF data 2681 # warning: CONCAT as || operator 2682 sql_query_update = f""" 2683 UPDATE variants as table_variants 2684 SET INFO = CASE 2685 WHEN INFO NOT IN ('', '.') 2686 THEN INFO 2687 ELSE '' 2688 END || 2689 ( 2690 SELECT 2691 CASE 2692 WHEN table_variants.INFO NOT IN ('','.') 2693 AND table_vcf.INFO NOT IN ('','.') 2694 THEN ';' 2695 ELSE '' 2696 END || 2697 CASE 2698 WHEN table_vcf.INFO NOT IN ('','.') 2699 THEN table_vcf.INFO 2700 ELSE '' 2701 END 2702 FROM {table_vcf} as table_vcf 2703 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2704 AND table_vcf.\"POS\" = table_variants.\"POS\" 2705 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2706 AND table_vcf.\"REF\" = table_variants.\"REF\" 2707 
) 2708 """ 2709 self.conn.execute(sql_query_update) 2710 2711 # Drop temporary table 2712 sql_drop = f"DROP TABLE {table_vcf}" 2713 self.conn.execute(sql_drop) 2714 2715 def drop_variants_table(self) -> None: 2716 """ 2717 > This function drops the variants table 2718 """ 2719 2720 table_variants = self.get_table_variants() 2721 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2722 self.conn.execute(sql_table_variants) 2723 2724 def set_variant_id( 2725 self, variant_id_column: str = "variant_id", force: bool = None 2726 ) -> str: 2727 """ 2728 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2729 `#CHROM`, `POS`, `REF`, and `ALT` columns 2730 2731 :param variant_id_column: The name of the column to be created in the variants table, defaults 2732 to variant_id 2733 :type variant_id_column: str (optional) 2734 :param force: If True, the variant_id column will be created even if it already exists 2735 :type force: bool 2736 :return: The name of the column that contains the variant_id 2737 """ 2738 2739 # Assembly 2740 assembly = self.get_param().get( 2741 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2742 ) 2743 2744 # INFO/Tag prefix 2745 prefix = self.get_explode_infos_prefix() 2746 2747 # Explode INFO/SVTYPE 2748 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2749 2750 # variants table 2751 table_variants = self.get_table_variants() 2752 2753 # variant_id column 2754 if not variant_id_column: 2755 variant_id_column = "variant_id" 2756 2757 # Creta variant_id column 2758 if "variant_id" not in self.get_extra_infos() or force: 2759 2760 # Create column 2761 self.add_column( 2762 table_name=table_variants, 2763 column_name=variant_id_column, 2764 column_type="UBIGINT", 2765 default_value="0", 2766 ) 2767 2768 # Update column 2769 self.conn.execute( 2770 f""" 2771 UPDATE {table_variants} 2772 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2773 """ 2774 ) 2775 2776 # Remove added columns 2777 for added_column in added_columns: 2778 self.drop_column(column=added_column) 2779 2780 # return variant_id column name 2781 return variant_id_column 2782 2783 def get_variant_id_column( 2784 self, variant_id_column: str = "variant_id", force: bool = None 2785 ) -> str: 2786 """ 2787 This function returns the variant_id column name 2788 2789 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2790 defaults to variant_id 2791 :type variant_id_column: str (optional) 2792 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2793 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2794 if it is not already set, or if it is set 2795 :type force: bool 2796 :return: The variant_id column name. 2797 """ 2798 2799 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2800 2801 ### 2802 # Annotation 2803 ### 2804 2805 def scan_databases( 2806 self, 2807 database_formats: list = ["parquet"], 2808 database_releases: list = ["current"], 2809 ) -> dict: 2810 """ 2811 The function `scan_databases` scans for available databases based on specified formats and 2812 releases. 2813 2814 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2815 of the databases to be scanned. In this case, the accepted format is "parquet" 2816 :type database_formats: list ["parquet"] 2817 :param database_releases: The `database_releases` parameter is a list that specifies the 2818 releases of the databases to be scanned. 
In the provided function, the default value for 2819 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2820 databases that are in the "current" 2821 :type database_releases: list 2822 :return: The function `scan_databases` returns a dictionary containing information about 2823 databases that match the specified formats and releases. 2824 """ 2825 2826 # Config 2827 config = self.get_config() 2828 2829 # Param 2830 param = self.get_param() 2831 2832 # Param - Assembly 2833 assembly = param.get("assembly", config.get("assembly", None)) 2834 if not assembly: 2835 assembly = DEFAULT_ASSEMBLY 2836 log.warning(f"Default assembly '{assembly}'") 2837 2838 # Scan for availabled databases 2839 log.info( 2840 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2841 ) 2842 databases_infos_dict = databases_infos( 2843 database_folder_releases=database_releases, 2844 database_formats=database_formats, 2845 assembly=assembly, 2846 config=config, 2847 ) 2848 log.info( 2849 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2850 ) 2851 2852 return databases_infos_dict 2853 2854 def annotation(self) -> None: 2855 """ 2856 It annotates the VCF file with the annotations specified in the config file. 
2857 """ 2858 2859 # Config 2860 config = self.get_config() 2861 2862 # Param 2863 param = self.get_param() 2864 2865 # Param - Assembly 2866 assembly = param.get("assembly", config.get("assembly", None)) 2867 if not assembly: 2868 assembly = DEFAULT_ASSEMBLY 2869 log.warning(f"Default assembly '{assembly}'") 2870 2871 # annotations databases folders 2872 annotations_databases = set( 2873 config.get("folders", {}) 2874 .get("databases", {}) 2875 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2876 + config.get("folders", {}) 2877 .get("databases", {}) 2878 .get("parquet", ["~/howard/databases/parquet/current"]) 2879 + config.get("folders", {}) 2880 .get("databases", {}) 2881 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2882 ) 2883 2884 # Get param annotations 2885 if param.get("annotations", None) and isinstance( 2886 param.get("annotations", None), str 2887 ): 2888 log.debug(param.get("annotations", None)) 2889 param_annotation_list = param.get("annotations").split(",") 2890 else: 2891 param_annotation_list = [] 2892 2893 # Each tools param 2894 if param.get("annotation_parquet", None) != None: 2895 log.debug( 2896 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2897 ) 2898 if isinstance(param.get("annotation_parquet", None), list): 2899 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2900 else: 2901 param_annotation_list.append(param.get("annotation_parquet")) 2902 if param.get("annotation_snpsift", None) != None: 2903 if isinstance(param.get("annotation_snpsift", None), list): 2904 param_annotation_list.append( 2905 "snpsift:" 2906 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2907 ) 2908 else: 2909 param_annotation_list.append( 2910 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2911 ) 2912 if param.get("annotation_snpeff", None) != None: 2913 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2914 if param.get("annotation_bcftools", 
None) != None: 2915 if isinstance(param.get("annotation_bcftools", None), list): 2916 param_annotation_list.append( 2917 "bcftools:" 2918 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2919 ) 2920 else: 2921 param_annotation_list.append( 2922 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2923 ) 2924 if param.get("annotation_annovar", None) != None: 2925 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2926 if param.get("annotation_exomiser", None) != None: 2927 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2928 if param.get("annotation_splice", None) != None: 2929 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2930 2931 # Merge param annotations list 2932 param["annotations"] = ",".join(param_annotation_list) 2933 2934 # debug 2935 log.debug(f"param_annotations={param['annotations']}") 2936 2937 if param.get("annotations"): 2938 2939 # Log 2940 # log.info("Annotations - Check annotation parameters") 2941 2942 if not "annotation" in param: 2943 param["annotation"] = {} 2944 2945 # List of annotations parameters 2946 annotations_list_input = {} 2947 if isinstance(param.get("annotations", None), str): 2948 annotation_file_list = [ 2949 value for value in param.get("annotations", "").split(",") 2950 ] 2951 for annotation_file in annotation_file_list: 2952 annotations_list_input[annotation_file] = {"INFO": None} 2953 else: 2954 annotations_list_input = param.get("annotations", {}) 2955 2956 log.info(f"Quick Annotations:") 2957 for annotation_key in list(annotations_list_input.keys()): 2958 log.info(f" {annotation_key}") 2959 2960 # List of annotations and associated fields 2961 annotations_list = {} 2962 2963 for annotation_file in annotations_list_input: 2964 2965 # Explode annotations if ALL 2966 if ( 2967 annotation_file.upper() == "ALL" 2968 or annotation_file.upper().startswith("ALL:") 2969 ): 2970 2971 # check ALL parameters (formats, releases) 
2972 annotation_file_split = annotation_file.split(":") 2973 database_formats = "parquet" 2974 database_releases = "current" 2975 for annotation_file_option in annotation_file_split[1:]: 2976 database_all_options_split = annotation_file_option.split("=") 2977 if database_all_options_split[0] == "format": 2978 database_formats = database_all_options_split[1].split("+") 2979 if database_all_options_split[0] == "release": 2980 database_releases = database_all_options_split[1].split("+") 2981 2982 # Scan for availabled databases 2983 databases_infos_dict = self.scan_databases( 2984 database_formats=database_formats, 2985 database_releases=database_releases, 2986 ) 2987 2988 # Add found databases in annotation parameters 2989 for database_infos in databases_infos_dict.keys(): 2990 annotations_list[database_infos] = {"INFO": None} 2991 2992 else: 2993 annotations_list[annotation_file] = annotations_list_input[ 2994 annotation_file 2995 ] 2996 2997 # Check each databases 2998 if len(annotations_list): 2999 3000 log.info( 3001 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3002 ) 3003 3004 for annotation_file in annotations_list: 3005 3006 # Init 3007 annotations = annotations_list.get(annotation_file, None) 3008 3009 # Annotation snpEff 3010 if annotation_file.startswith("snpeff"): 3011 3012 log.debug(f"Quick Annotation snpEff") 3013 3014 if "snpeff" not in param["annotation"]: 3015 param["annotation"]["snpeff"] = {} 3016 3017 if "options" not in param["annotation"]["snpeff"]: 3018 param["annotation"]["snpeff"]["options"] = "" 3019 3020 # snpEff options in annotations 3021 param["annotation"]["snpeff"]["options"] = "".join( 3022 annotation_file.split(":")[1:] 3023 ) 3024 3025 # Annotation Annovar 3026 elif annotation_file.startswith("annovar"): 3027 3028 log.debug(f"Quick Annotation Annovar") 3029 3030 if "annovar" not in param["annotation"]: 3031 param["annotation"]["annovar"] = {} 3032 3033 if "annotations" not in param["annotation"]["annovar"]: 3034 param["annotation"]["annovar"]["annotations"] = {} 3035 3036 # Options 3037 annotation_file_split = annotation_file.split(":") 3038 for annotation_file_annotation in annotation_file_split[1:]: 3039 if annotation_file_annotation: 3040 param["annotation"]["annovar"]["annotations"][ 3041 annotation_file_annotation 3042 ] = annotations 3043 3044 # Annotation Exomiser 3045 elif annotation_file.startswith("exomiser"): 3046 3047 log.debug(f"Quick Annotation Exomiser") 3048 3049 param["annotation"]["exomiser"] = params_string_to_dict( 3050 annotation_file 3051 ) 3052 3053 # Annotation Splice 3054 elif annotation_file.startswith("splice"): 3055 3056 log.debug(f"Quick Annotation Splice") 3057 3058 param["annotation"]["splice"] = params_string_to_dict( 3059 annotation_file 3060 ) 3061 3062 # Annotation Parquet or BCFTOOLS 3063 else: 3064 3065 # Tools detection 3066 if annotation_file.startswith("bcftools:"): 3067 annotation_tool_initial = "bcftools" 3068 annotation_file = ":".join(annotation_file.split(":")[1:]) 3069 elif annotation_file.startswith("snpsift:"): 3070 annotation_tool_initial = 
"snpsift" 3071 annotation_file = ":".join(annotation_file.split(":")[1:]) 3072 else: 3073 annotation_tool_initial = None 3074 3075 # list of files 3076 annotation_file_list = annotation_file.replace("+", ":").split( 3077 ":" 3078 ) 3079 3080 for annotation_file in annotation_file_list: 3081 3082 if annotation_file: 3083 3084 # Annotation tool initial 3085 annotation_tool = annotation_tool_initial 3086 3087 # Find file 3088 annotation_file_found = None 3089 3090 # Expand user 3091 annotation_file = full_path(annotation_file) 3092 3093 if os.path.exists(annotation_file): 3094 annotation_file_found = annotation_file 3095 3096 else: 3097 # Find within assembly folders 3098 for annotations_database in annotations_databases: 3099 found_files = find_all( 3100 annotation_file, 3101 os.path.join( 3102 annotations_database, assembly 3103 ), 3104 ) 3105 if len(found_files) > 0: 3106 annotation_file_found = found_files[0] 3107 break 3108 if not annotation_file_found and not assembly: 3109 # Find within folders 3110 for ( 3111 annotations_database 3112 ) in annotations_databases: 3113 found_files = find_all( 3114 annotation_file, annotations_database 3115 ) 3116 if len(found_files) > 0: 3117 annotation_file_found = found_files[0] 3118 break 3119 log.debug( 3120 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3121 ) 3122 3123 # Full path 3124 annotation_file_found = full_path(annotation_file_found) 3125 3126 if annotation_file_found: 3127 3128 database = Database(database=annotation_file_found) 3129 quick_annotation_format = database.get_format() 3130 quick_annotation_is_compressed = ( 3131 database.is_compressed() 3132 ) 3133 quick_annotation_is_indexed = os.path.exists( 3134 f"{annotation_file_found}.tbi" 3135 ) 3136 bcftools_preference = False 3137 3138 # Check Annotation Tool 3139 if not annotation_tool: 3140 if ( 3141 bcftools_preference 3142 and quick_annotation_format 3143 in ["vcf", "bed"] 3144 and quick_annotation_is_compressed 3145 and 
quick_annotation_is_indexed 3146 ): 3147 annotation_tool = "bcftools" 3148 elif quick_annotation_format in [ 3149 "vcf", 3150 "bed", 3151 "tsv", 3152 "tsv", 3153 "csv", 3154 "json", 3155 "tbl", 3156 "parquet", 3157 "duckdb", 3158 ]: 3159 annotation_tool = "parquet" 3160 else: 3161 log.error( 3162 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3163 ) 3164 raise ValueError( 3165 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3166 ) 3167 3168 log.debug( 3169 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3170 ) 3171 3172 # Annotation Tool dispatch 3173 if annotation_tool: 3174 if annotation_tool not in param["annotation"]: 3175 param["annotation"][annotation_tool] = {} 3176 if ( 3177 "annotations" 3178 not in param["annotation"][annotation_tool] 3179 ): 3180 param["annotation"][annotation_tool][ 3181 "annotations" 3182 ] = {} 3183 param["annotation"][annotation_tool][ 3184 "annotations" 3185 ][annotation_file_found] = annotations 3186 3187 else: 3188 log.error( 3189 f"Quick Annotation File {annotation_file} does NOT exist" 3190 ) 3191 3192 self.set_param(param) 3193 3194 if param.get("annotation", None): 3195 log.info("Annotations") 3196 if param.get("annotation", {}).get("parquet", None): 3197 log.info("Annotations 'parquet'...") 3198 self.annotation_parquet() 3199 if param.get("annotation", {}).get("bcftools", None): 3200 log.info("Annotations 'bcftools'...") 3201 self.annotation_bcftools() 3202 if param.get("annotation", {}).get("snpsift", None): 3203 log.info("Annotations 'snpsift'...") 3204 self.annotation_snpsift() 3205 if param.get("annotation", {}).get("annovar", None): 3206 log.info("Annotations 'annovar'...") 3207 self.annotation_annovar() 3208 if param.get("annotation", {}).get("snpeff", None): 3209 log.info("Annotations 'snpeff'...") 3210 self.annotation_snpeff() 3211 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3212 log.info("Annotations 'exomiser'...") 3213 self.annotation_exomiser() 3214 if param.get("annotation", {}).get("splice", None) is not None: 3215 log.info("Annotations 'splice' ...") 3216 self.annotation_splice() 3217 3218 # Explode INFOS fields into table fields 3219 if self.get_explode_infos(): 3220 self.explode_infos( 3221 prefix=self.get_explode_infos_prefix(), 3222 fields=self.get_explode_infos_fields(), 3223 force=True, 3224 ) 3225 3226 def annotation_snpsift(self, threads: int = None) -> None: 3227 """ 3228 This function annotate with bcftools 3229 3230 :param threads: Number of threads to use 3231 :return: the value of the variable "return_value". 3232 """ 3233 3234 # DEBUG 3235 log.debug("Start annotation with bcftools databases") 3236 3237 # Threads 3238 if not threads: 3239 threads = self.get_threads() 3240 log.debug("Threads: " + str(threads)) 3241 3242 # Config 3243 config = self.get_config() 3244 log.debug("Config: " + str(config)) 3245 3246 # Config - snpSift 3247 snpsift_bin_command = get_bin_command( 3248 bin="SnpSift.jar", 3249 tool="snpsift", 3250 bin_type="jar", 3251 config=config, 3252 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3253 ) 3254 if not snpsift_bin_command: 3255 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3256 log.error(msg_err) 3257 raise ValueError(msg_err) 3258 3259 # Config - bcftools 3260 bcftools_bin_command = get_bin_command( 3261 bin="bcftools", 3262 tool="bcftools", 3263 bin_type="bin", 3264 config=config, 3265 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3266 ) 3267 if not bcftools_bin_command: 3268 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3269 log.error(msg_err) 3270 raise ValueError(msg_err) 3271 3272 # Config - BCFTools databases folders 3273 databases_folders = set( 3274 self.get_config() 3275 .get("folders", {}) 3276 .get("databases", {}) 3277 .get("annotations", ["."]) 3278 + self.get_config() 3279 .get("folders", {}) 3280 
.get("databases", {}) 3281 .get("bcftools", ["."]) 3282 ) 3283 log.debug("Databases annotations: " + str(databases_folders)) 3284 3285 # Param 3286 annotations = ( 3287 self.get_param() 3288 .get("annotation", {}) 3289 .get("snpsift", {}) 3290 .get("annotations", None) 3291 ) 3292 log.debug("Annotations: " + str(annotations)) 3293 3294 # Assembly 3295 assembly = self.get_param().get( 3296 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3297 ) 3298 3299 # Data 3300 table_variants = self.get_table_variants() 3301 3302 # Check if not empty 3303 log.debug("Check if not empty") 3304 sql_query_chromosomes = ( 3305 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3306 ) 3307 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3308 if not sql_query_chromosomes_df["count"][0]: 3309 log.info(f"VCF empty") 3310 return 3311 3312 # VCF header 3313 vcf_reader = self.get_header() 3314 log.debug("Initial header: " + str(vcf_reader.infos)) 3315 3316 # Existing annotations 3317 for vcf_annotation in self.get_header().infos: 3318 3319 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3320 log.debug( 3321 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3322 ) 3323 3324 if annotations: 3325 3326 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3327 3328 # Export VCF file 3329 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3330 3331 # Init 3332 commands = {} 3333 3334 for annotation in annotations: 3335 annotation_fields = annotations[annotation] 3336 3337 # Annotation Name 3338 annotation_name = os.path.basename(annotation) 3339 3340 if not annotation_fields: 3341 annotation_fields = {"INFO": None} 3342 3343 log.debug(f"Annotation '{annotation_name}'") 3344 log.debug( 3345 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3346 ) 3347 3348 # Create Database 3349 database = Database( 3350 database=annotation, 3351 databases_folders=databases_folders, 3352 
assembly=assembly, 3353 ) 3354 3355 # Find files 3356 db_file = database.get_database() 3357 db_file = full_path(db_file) 3358 db_hdr_file = database.get_header_file() 3359 db_hdr_file = full_path(db_hdr_file) 3360 db_file_type = database.get_format() 3361 db_tbi_file = f"{db_file}.tbi" 3362 db_file_compressed = database.is_compressed() 3363 3364 # Check if compressed 3365 if not db_file_compressed: 3366 log.error( 3367 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3368 ) 3369 raise ValueError( 3370 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3371 ) 3372 3373 # Check if indexed 3374 if not os.path.exists(db_tbi_file): 3375 log.error( 3376 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3377 ) 3378 raise ValueError( 3379 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3380 ) 3381 3382 # Check index - try to create if not exists 3383 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3384 log.error("Annotation failed: database not valid") 3385 log.error(f"Annotation annotation file: {db_file}") 3386 log.error(f"Annotation annotation header: {db_hdr_file}") 3387 log.error(f"Annotation annotation index: {db_tbi_file}") 3388 raise ValueError( 3389 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3390 ) 3391 else: 3392 3393 log.debug( 3394 f"Annotation '{annotation}' - file: " 3395 + str(db_file) 3396 + " and " 3397 + str(db_hdr_file) 3398 ) 3399 3400 # Load header as VCF object 3401 db_hdr_vcf = Variants(input=db_hdr_file) 3402 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3403 log.debug( 3404 "Annotation database header: " 3405 + str(db_hdr_vcf_header_infos) 3406 ) 3407 3408 # For all fields in database 3409 annotation_fields_full = False 3410 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3411 annotation_fields = { 3412 key: key for key in 
db_hdr_vcf_header_infos 3413 } 3414 log.debug( 3415 "Annotation database header - All annotations added: " 3416 + str(annotation_fields) 3417 ) 3418 annotation_fields_full = True 3419 3420 # # Create file for field rename 3421 # log.debug("Create file for field rename") 3422 # tmp_rename = NamedTemporaryFile( 3423 # prefix=self.get_prefix(), 3424 # dir=self.get_tmp_dir(), 3425 # suffix=".rename", 3426 # delete=False, 3427 # ) 3428 # tmp_rename_name = tmp_rename.name 3429 # tmp_files.append(tmp_rename_name) 3430 3431 # Number of fields 3432 nb_annotation_field = 0 3433 annotation_list = [] 3434 annotation_infos_rename_list = [] 3435 3436 for annotation_field in annotation_fields: 3437 3438 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3439 annotation_fields_new_name = annotation_fields.get( 3440 annotation_field, annotation_field 3441 ) 3442 if not annotation_fields_new_name: 3443 annotation_fields_new_name = annotation_field 3444 3445 # Check if field is in DB and if field is not elready in input data 3446 if ( 3447 annotation_field in db_hdr_vcf.get_header().infos 3448 and annotation_fields_new_name 3449 not in self.get_header().infos 3450 ): 3451 3452 log.info( 3453 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3454 ) 3455 3456 # BCFTools annotate param to rename fields 3457 if annotation_field != annotation_fields_new_name: 3458 annotation_infos_rename_list.append( 3459 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3460 ) 3461 3462 # Add INFO field to header 3463 db_hdr_vcf_header_infos_number = ( 3464 db_hdr_vcf_header_infos[annotation_field].num or "." 
3465 ) 3466 db_hdr_vcf_header_infos_type = ( 3467 db_hdr_vcf_header_infos[annotation_field].type 3468 or "String" 3469 ) 3470 db_hdr_vcf_header_infos_description = ( 3471 db_hdr_vcf_header_infos[annotation_field].desc 3472 or f"{annotation_field} description" 3473 ) 3474 db_hdr_vcf_header_infos_source = ( 3475 db_hdr_vcf_header_infos[annotation_field].source 3476 or "unknown" 3477 ) 3478 db_hdr_vcf_header_infos_version = ( 3479 db_hdr_vcf_header_infos[annotation_field].version 3480 or "unknown" 3481 ) 3482 3483 vcf_reader.infos[annotation_fields_new_name] = ( 3484 vcf.parser._Info( 3485 annotation_fields_new_name, 3486 db_hdr_vcf_header_infos_number, 3487 db_hdr_vcf_header_infos_type, 3488 db_hdr_vcf_header_infos_description, 3489 db_hdr_vcf_header_infos_source, 3490 db_hdr_vcf_header_infos_version, 3491 self.code_type_map[ 3492 db_hdr_vcf_header_infos_type 3493 ], 3494 ) 3495 ) 3496 3497 annotation_list.append(annotation_field) 3498 3499 nb_annotation_field += 1 3500 3501 else: 3502 3503 if ( 3504 annotation_field 3505 not in db_hdr_vcf.get_header().infos 3506 ): 3507 log.warning( 3508 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3509 ) 3510 if ( 3511 annotation_fields_new_name 3512 in self.get_header().infos 3513 ): 3514 log.warning( 3515 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3516 ) 3517 3518 log.info( 3519 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3520 ) 3521 3522 annotation_infos = ",".join(annotation_list) 3523 3524 if annotation_infos != "": 3525 3526 # Annotated VCF (and error file) 3527 tmp_annotation_vcf_name = os.path.join( 3528 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3529 ) 3530 tmp_annotation_vcf_name_err = ( 3531 tmp_annotation_vcf_name + ".err" 3532 ) 3533 3534 # Add fields to annotate 3535 if not annotation_fields_full: 3536 annotation_infos_option = f"-info {annotation_infos}" 3537 else: 
3538 annotation_infos_option = "" 3539 3540 # Info fields rename 3541 if annotation_infos_rename_list: 3542 annotation_infos_rename = " -c " + ",".join( 3543 annotation_infos_rename_list 3544 ) 3545 else: 3546 annotation_infos_rename = "" 3547 3548 # Annotate command 3549 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3550 3551 # Add command 3552 commands[command_annotate] = tmp_annotation_vcf_name 3553 3554 if commands: 3555 3556 # Export VCF file 3557 self.export_variant_vcf( 3558 vcf_file=tmp_vcf_name, 3559 remove_info=True, 3560 add_samples=False, 3561 index=True, 3562 ) 3563 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3564 3565 # Num command 3566 nb_command = 0 3567 3568 # Annotate 3569 for command_annotate in commands: 3570 nb_command += 1 3571 log.info( 3572 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3573 ) 3574 log.debug(f"command_annotate={command_annotate}") 3575 run_parallel_commands([command_annotate], threads) 3576 3577 # Debug 3578 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3579 3580 # Update variants 3581 log.info( 3582 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3583 ) 3584 self.update_from_vcf(commands[command_annotate]) 3585 3586 def annotation_bcftools(self, threads: int = None) -> None: 3587 """ 3588 This function annotate with bcftools 3589 3590 :param threads: Number of threads to use 3591 :return: the value of the variable "return_value". 
3592 """ 3593 3594 # DEBUG 3595 log.debug("Start annotation with bcftools databases") 3596 3597 # Threads 3598 if not threads: 3599 threads = self.get_threads() 3600 log.debug("Threads: " + str(threads)) 3601 3602 # Config 3603 config = self.get_config() 3604 log.debug("Config: " + str(config)) 3605 3606 # DEBUG 3607 delete_tmp = True 3608 if self.get_config().get("verbosity", "warning") in ["debug"]: 3609 delete_tmp = False 3610 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3611 3612 # Config - BCFTools bin command 3613 bcftools_bin_command = get_bin_command( 3614 bin="bcftools", 3615 tool="bcftools", 3616 bin_type="bin", 3617 config=config, 3618 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3619 ) 3620 if not bcftools_bin_command: 3621 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3622 log.error(msg_err) 3623 raise ValueError(msg_err) 3624 3625 # Config - BCFTools databases folders 3626 databases_folders = set( 3627 self.get_config() 3628 .get("folders", {}) 3629 .get("databases", {}) 3630 .get("annotations", ["."]) 3631 + self.get_config() 3632 .get("folders", {}) 3633 .get("databases", {}) 3634 .get("bcftools", ["."]) 3635 ) 3636 log.debug("Databases annotations: " + str(databases_folders)) 3637 3638 # Param 3639 annotations = ( 3640 self.get_param() 3641 .get("annotation", {}) 3642 .get("bcftools", {}) 3643 .get("annotations", None) 3644 ) 3645 log.debug("Annotations: " + str(annotations)) 3646 3647 # Assembly 3648 assembly = self.get_param().get( 3649 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3650 ) 3651 3652 # Data 3653 table_variants = self.get_table_variants() 3654 3655 # Check if not empty 3656 log.debug("Check if not empty") 3657 sql_query_chromosomes = ( 3658 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3659 ) 3660 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3661 if not sql_query_chromosomes_df["count"][0]: 3662 log.info(f"VCF empty") 
3663 return 3664 3665 # Export in VCF 3666 log.debug("Create initial file to annotate") 3667 tmp_vcf = NamedTemporaryFile( 3668 prefix=self.get_prefix(), 3669 dir=self.get_tmp_dir(), 3670 suffix=".vcf.gz", 3671 delete=False, 3672 ) 3673 tmp_vcf_name = tmp_vcf.name 3674 3675 # VCF header 3676 vcf_reader = self.get_header() 3677 log.debug("Initial header: " + str(vcf_reader.infos)) 3678 3679 # Existing annotations 3680 for vcf_annotation in self.get_header().infos: 3681 3682 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3683 log.debug( 3684 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3685 ) 3686 3687 if annotations: 3688 3689 tmp_ann_vcf_list = [] 3690 commands = [] 3691 tmp_files = [] 3692 err_files = [] 3693 3694 for annotation in annotations: 3695 annotation_fields = annotations[annotation] 3696 3697 # Annotation Name 3698 annotation_name = os.path.basename(annotation) 3699 3700 if not annotation_fields: 3701 annotation_fields = {"INFO": None} 3702 3703 log.debug(f"Annotation '{annotation_name}'") 3704 log.debug( 3705 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3706 ) 3707 3708 # Create Database 3709 database = Database( 3710 database=annotation, 3711 databases_folders=databases_folders, 3712 assembly=assembly, 3713 ) 3714 3715 # Find files 3716 db_file = database.get_database() 3717 db_file = full_path(db_file) 3718 db_hdr_file = database.get_header_file() 3719 db_hdr_file = full_path(db_hdr_file) 3720 db_file_type = database.get_format() 3721 db_tbi_file = f"{db_file}.tbi" 3722 db_file_compressed = database.is_compressed() 3723 3724 # Check if compressed 3725 if not db_file_compressed: 3726 log.error( 3727 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3728 ) 3729 raise ValueError( 3730 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3731 ) 3732 3733 # Check if indexed 3734 if not os.path.exists(db_tbi_file): 3735 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3736 raise ValueError( 3737 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3738 ) 3739 3740 # Check index - try to create if not exists 3741 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3742 log.error("Annotation failed: database not valid") 3743 log.error(f"Annotation annotation file: {db_file}") 3744 log.error(f"Annotation annotation header: {db_hdr_file}") 3745 log.error(f"Annotation annotation index: {db_tbi_file}") 3746 raise ValueError( 3747 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3748 ) 3749 else: 3750 3751 log.debug( 3752 f"Annotation '{annotation}' - file: " 3753 + str(db_file) 3754 + " and " 3755 + str(db_hdr_file) 3756 ) 3757 3758 # Load header as VCF object 3759 db_hdr_vcf = Variants(input=db_hdr_file) 3760 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3761 log.debug( 3762 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3763 ) 3764 3765 # For all fields in database 3766 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3767 annotation_fields = { 3768 key: key for key in db_hdr_vcf_header_infos 3769 } 3770 log.debug( 3771 "Annotation database header - All annotations added: " 3772 + str(annotation_fields) 3773 ) 3774 3775 # Number of fields 3776 nb_annotation_field = 0 3777 annotation_list = [] 3778 3779 for annotation_field in annotation_fields: 3780 3781 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3782 annotation_fields_new_name = annotation_fields.get( 3783 annotation_field, annotation_field 3784 ) 3785 if not annotation_fields_new_name: 3786 annotation_fields_new_name = annotation_field 3787 3788 # Check if field is in DB and if field is not elready in input data 3789 if ( 3790 annotation_field in db_hdr_vcf.get_header().infos 3791 and annotation_fields_new_name 3792 not in self.get_header().infos 3793 ): 3794 3795 log.info( 3796 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3797 ) 3798 3799 # Add INFO field to header 3800 db_hdr_vcf_header_infos_number = ( 3801 db_hdr_vcf_header_infos[annotation_field].num or "." 3802 ) 3803 db_hdr_vcf_header_infos_type = ( 3804 db_hdr_vcf_header_infos[annotation_field].type 3805 or "String" 3806 ) 3807 db_hdr_vcf_header_infos_description = ( 3808 db_hdr_vcf_header_infos[annotation_field].desc 3809 or f"{annotation_field} description" 3810 ) 3811 db_hdr_vcf_header_infos_source = ( 3812 db_hdr_vcf_header_infos[annotation_field].source 3813 or "unknown" 3814 ) 3815 db_hdr_vcf_header_infos_version = ( 3816 db_hdr_vcf_header_infos[annotation_field].version 3817 or "unknown" 3818 ) 3819 3820 vcf_reader.infos[annotation_fields_new_name] = ( 3821 vcf.parser._Info( 3822 annotation_fields_new_name, 3823 db_hdr_vcf_header_infos_number, 3824 db_hdr_vcf_header_infos_type, 3825 db_hdr_vcf_header_infos_description, 3826 db_hdr_vcf_header_infos_source, 3827 db_hdr_vcf_header_infos_version, 3828 self.code_type_map[db_hdr_vcf_header_infos_type], 3829 ) 3830 ) 3831 3832 # annotation_list.append(annotation_field) 3833 if annotation_field != annotation_fields_new_name: 3834 annotation_list.append( 3835 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3836 ) 3837 else: 3838 annotation_list.append(annotation_field) 3839 3840 nb_annotation_field += 1 3841 3842 else: 3843 3844 if annotation_field not in db_hdr_vcf.get_header().infos: 3845 log.warning( 3846 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3847 ) 3848 if annotation_fields_new_name in self.get_header().infos: 3849 log.warning( 3850 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3851 ) 3852 3853 log.info( 3854 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3855 ) 3856 3857 annotation_infos = ",".join(annotation_list) 3858 3859 if annotation_infos != "": 3860 3861 # Protect header for bcftools (remove "#CHROM" and variants line) 3862 log.debug("Protect Header file - remove #CHROM line if exists") 3863 tmp_header_vcf = NamedTemporaryFile( 3864 prefix=self.get_prefix(), 3865 dir=self.get_tmp_dir(), 3866 suffix=".hdr", 3867 delete=False, 3868 ) 3869 tmp_header_vcf_name = tmp_header_vcf.name 3870 tmp_files.append(tmp_header_vcf_name) 3871 # Command 3872 if db_hdr_file.endswith(".gz"): 3873 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3874 else: 3875 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3876 # Run 3877 run_parallel_commands([command_extract_header], 1) 3878 3879 # Find chomosomes 3880 log.debug("Find chromosomes ") 3881 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3882 sql_query_chromosomes_df = self.get_query_to_df( 3883 sql_query_chromosomes 3884 ) 3885 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3886 3887 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3888 3889 # BED columns in the annotation file 3890 if db_file_type in ["bed"]: 3891 annotation_infos = "CHROM,POS,POS," + annotation_infos 3892 3893 for chrom in chomosomes_list: 3894 3895 # Create BED on initial VCF 3896 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3897 tmp_bed = NamedTemporaryFile( 3898 prefix=self.get_prefix(), 3899 
dir=self.get_tmp_dir(), 3900 suffix=".bed", 3901 delete=False, 3902 ) 3903 tmp_bed_name = tmp_bed.name 3904 tmp_files.append(tmp_bed_name) 3905 3906 # Detecte regions 3907 log.debug( 3908 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3909 ) 3910 window = 1000000 3911 sql_query_intervals_for_bed = f""" 3912 SELECT \"#CHROM\", 3913 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3914 \"POS\"+{window} 3915 FROM {table_variants} as table_variants 3916 WHERE table_variants.\"#CHROM\" = '{chrom}' 3917 """ 3918 regions = self.conn.execute( 3919 sql_query_intervals_for_bed 3920 ).fetchall() 3921 merged_regions = merge_regions(regions) 3922 log.debug( 3923 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3924 ) 3925 3926 header = ["#CHROM", "START", "END"] 3927 with open(tmp_bed_name, "w") as f: 3928 # Write the header with tab delimiter 3929 f.write("\t".join(header) + "\n") 3930 for d in merged_regions: 3931 # Write each data row with tab delimiter 3932 f.write("\t".join(map(str, d)) + "\n") 3933 3934 # Tmp files 3935 tmp_annotation_vcf = NamedTemporaryFile( 3936 prefix=self.get_prefix(), 3937 dir=self.get_tmp_dir(), 3938 suffix=".vcf.gz", 3939 delete=False, 3940 ) 3941 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3942 tmp_files.append(tmp_annotation_vcf_name) 3943 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3944 tmp_annotation_vcf_name_err = ( 3945 tmp_annotation_vcf_name + ".err" 3946 ) 3947 err_files.append(tmp_annotation_vcf_name_err) 3948 3949 # Annotate Command 3950 log.debug( 3951 f"Annotation '{annotation}' - add bcftools command" 3952 ) 3953 3954 # Command 3955 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3956 3957 # Add command 3958 commands.append(command_annotate) 3959 3960 # if some commands 3961 if commands: 3962 3963 # Export VCF file 3964 self.export_variant_vcf( 3965 vcf_file=tmp_vcf_name, 3966 remove_info=True, 3967 add_samples=False, 3968 index=True, 3969 ) 3970 3971 # Threads 3972 # calculate threads for annotated commands 3973 if commands: 3974 threads_bcftools_annotate = round(threads / len(commands)) 3975 else: 3976 threads_bcftools_annotate = 1 3977 3978 if not threads_bcftools_annotate: 3979 threads_bcftools_annotate = 1 3980 3981 # Add threads option to bcftools commands 3982 if threads_bcftools_annotate > 1: 3983 commands_threaded = [] 3984 for command in commands: 3985 commands_threaded.append( 3986 command.replace( 3987 f"{bcftools_bin_command} annotate ", 3988 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3989 ) 3990 ) 3991 commands = commands_threaded 3992 3993 # Command annotation multithreading 3994 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3995 log.info( 3996 f"Annotation - Annotation multithreaded in " 3997 + str(len(commands)) 3998 + " commands" 3999 ) 4000 4001 run_parallel_commands(commands, threads) 4002 4003 # Merge 4004 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4005 4006 if tmp_ann_vcf_list_cmd: 4007 4008 # Tmp file 4009 tmp_annotate_vcf = NamedTemporaryFile( 4010 prefix=self.get_prefix(), 4011 dir=self.get_tmp_dir(), 4012 suffix=".vcf.gz", 4013 delete=True, 4014 ) 4015 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4016 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4017 err_files.append(tmp_annotate_vcf_name_err) 4018 4019 # Tmp file remove command 4020 tmp_files_remove_command = "" 4021 if tmp_files: 4022 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4023 4024 # Command merge 4025 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4026 log.info( 4027 f"Annotation - Annotation merging " 4028 + str(len(commands)) 4029 + " annotated files" 4030 ) 4031 log.debug(f"Annotation - merge command: {merge_command}") 4032 run_parallel_commands([merge_command], 1) 4033 4034 # Error messages 4035 log.info(f"Error/Warning messages:") 4036 error_message_command_all = [] 4037 error_message_command_warning = [] 4038 error_message_command_err = [] 4039 for err_file in err_files: 4040 with open(err_file, "r") as f: 4041 for line in f: 4042 message = line.strip() 4043 error_message_command_all.append(message) 4044 if line.startswith("[W::"): 4045 error_message_command_warning.append(message) 4046 if line.startswith("[E::"): 4047 error_message_command_err.append( 4048 f"{err_file}: " + message 4049 ) 4050 # log info 4051 for message in list( 4052 set(error_message_command_err + error_message_command_warning) 4053 ): 4054 log.info(f" {message}") 4055 # debug info 4056 for message in list(set(error_message_command_all)): 4057 log.debug(f" {message}") 4058 # failed 4059 if len(error_message_command_err): 4060 log.error("Annotation failed: Error in commands") 4061 raise ValueError("Annotation failed: Error in commands") 4062 4063 # Update variants 4064 log.info(f"Annotation - Updating...") 4065 self.update_from_vcf(tmp_annotate_vcf_name) 4066 4067 def annotation_exomiser(self, threads: int = None) -> None: 4068 """ 4069 This function annotate with Exomiser 4070 4071 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4072 - "analysis" (dict/file): 4073 Full analysis dictionnary parameters (see Exomiser docs). 4074 Either a dict, or a file in JSON or YAML format. 4075 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4076 Default : None 4077 - "preset" (string): 4078 Analysis preset (available in config folder). 
4079 Used if no full "analysis" is provided. 4080 Default: "exome" 4081 - "phenopacket" (dict/file): 4082 Samples and phenotipic features parameters (see Exomiser docs). 4083 Either a dict, or a file in JSON or YAML format. 4084 Default: None 4085 - "subject" (dict): 4086 Sample parameters (see Exomiser docs). 4087 Example: 4088 "subject": 4089 { 4090 "id": "ISDBM322017", 4091 "sex": "FEMALE" 4092 } 4093 Default: None 4094 - "sample" (string): 4095 Sample name to construct "subject" section: 4096 "subject": 4097 { 4098 "id": "<sample>", 4099 "sex": "UNKNOWN_SEX" 4100 } 4101 Default: None 4102 - "phenotypicFeatures" (dict) 4103 Phenotypic features to construct "subject" section. 4104 Example: 4105 "phenotypicFeatures": 4106 [ 4107 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4108 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4109 ] 4110 - "hpo" (list) 4111 List of HPO ids as phenotypic features. 4112 Example: 4113 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4114 Default: [] 4115 - "outputOptions" (dict): 4116 Output options (see Exomiser docs). 4117 Default: 4118 "output_options" = 4119 { 4120 "outputContributingVariantsOnly": False, 4121 "numGenes": 0, 4122 "outputFormats": ["TSV_VARIANT", "VCF"] 4123 } 4124 - "transcript_source" (string): 4125 Transcript source (either "refseq", "ucsc", "ensembl") 4126 Default: "refseq" 4127 - "exomiser_to_info" (boolean): 4128 Add exomiser TSV file columns as INFO fields in VCF. 4129 Default: False 4130 - "release" (string): 4131 Exomise database release. 4132 If not exists, database release will be downloaded (take a while). 4133 Default: None (provided by application.properties configuration file) 4134 - "exomiser_application_properties" (file): 4135 Exomiser configuration file (see Exomiser docs). 4136 Useful to automatically download databases (especially for specific genome databases). 
4137 4138 Notes: 4139 - If no sample in parameters, first sample in VCF will be chosen 4140 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4141 4142 :param threads: The number of threads to use 4143 :return: None. 4144 """ 4145 4146 # DEBUG 4147 log.debug("Start annotation with Exomiser databases") 4148 4149 # Threads 4150 if not threads: 4151 threads = self.get_threads() 4152 log.debug("Threads: " + str(threads)) 4153 4154 # Config 4155 config = self.get_config() 4156 log.debug("Config: " + str(config)) 4157 4158 # Config - Folders - Databases 4159 databases_folders = ( 4160 config.get("folders", {}) 4161 .get("databases", {}) 4162 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4163 ) 4164 databases_folders = full_path(databases_folders) 4165 if not os.path.exists(databases_folders): 4166 log.error(f"Databases annotations: {databases_folders} NOT found") 4167 log.debug("Databases annotations: " + str(databases_folders)) 4168 4169 # Config - Exomiser 4170 exomiser_bin_command = get_bin_command( 4171 bin="exomiser-cli*.jar", 4172 tool="exomiser", 4173 bin_type="jar", 4174 config=config, 4175 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4176 ) 4177 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4178 if not exomiser_bin_command: 4179 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4180 log.error(msg_err) 4181 raise ValueError(msg_err) 4182 4183 # Param 4184 param = self.get_param() 4185 log.debug("Param: " + str(param)) 4186 4187 # Param - Exomiser 4188 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4189 log.debug(f"Param Exomiser: {param_exomiser}") 4190 4191 # Param - Assembly 4192 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4193 log.debug("Assembly: " + str(assembly)) 4194 4195 # Data 4196 table_variants = self.get_table_variants() 4197 4198 # Check if not empty 4199 log.debug("Check if not empty") 4200 sql_query_chromosomes = 
( 4201 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4202 ) 4203 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4204 log.info(f"VCF empty") 4205 return False 4206 4207 # VCF header 4208 vcf_reader = self.get_header() 4209 log.debug("Initial header: " + str(vcf_reader.infos)) 4210 4211 # Samples 4212 samples = self.get_header_sample_list() 4213 if not samples: 4214 log.error("No Samples in VCF") 4215 return False 4216 log.debug(f"Samples: {samples}") 4217 4218 # Memory limit 4219 memory_limit = self.get_memory("8G") 4220 log.debug(f"memory_limit: {memory_limit}") 4221 4222 # Exomiser java options 4223 exomiser_java_options = ( 4224 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4225 ) 4226 log.debug(f"Exomiser java options: {exomiser_java_options}") 4227 4228 # Download Exomiser (if not exists) 4229 exomiser_release = param_exomiser.get("release", None) 4230 exomiser_application_properties = param_exomiser.get( 4231 "exomiser_application_properties", None 4232 ) 4233 databases_download_exomiser( 4234 assemblies=[assembly], 4235 exomiser_folder=databases_folders, 4236 exomiser_release=exomiser_release, 4237 exomiser_phenotype_release=exomiser_release, 4238 exomiser_application_properties=exomiser_application_properties, 4239 ) 4240 4241 # Force annotation 4242 force_update_annotation = True 4243 4244 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4245 log.debug("Start annotation Exomiser") 4246 4247 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4248 4249 # tmp_dir = "/tmp/exomiser" 4250 4251 ### ANALYSIS ### 4252 ################ 4253 4254 # Create analysis.json through analysis dict 4255 # either analysis in param or by default 4256 # depending on preset exome/genome) 4257 4258 # Init analysis dict 4259 param_exomiser_analysis_dict = {} 4260 4261 # analysis from param 4262 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4263 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4264 4265 # If analysis in param -> load anlaysis json 4266 if param_exomiser_analysis: 4267 4268 # If param analysis is a file and exists 4269 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4270 param_exomiser_analysis 4271 ): 4272 # Load analysis file into analysis dict (either yaml or json) 4273 with open(param_exomiser_analysis) as json_file: 4274 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4275 4276 # If param analysis is a dict 4277 elif isinstance(param_exomiser_analysis, dict): 4278 # Load analysis dict into analysis dict (either yaml or json) 4279 param_exomiser_analysis_dict = param_exomiser_analysis 4280 4281 # Error analysis type 4282 else: 4283 log.error(f"Analysis type unknown. Check param file.") 4284 raise ValueError(f"Analysis type unknown. Check param file.") 4285 4286 # Case no input analysis config file/dict 4287 # Use preset (exome/genome) to open default config file 4288 if not param_exomiser_analysis_dict: 4289 4290 # default preset 4291 default_preset = "exome" 4292 4293 # Get param preset or default preset 4294 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4295 4296 # Try to find if preset is a file 4297 if os.path.exists(param_exomiser_preset): 4298 # Preset file is provided in full path 4299 param_exomiser_analysis_default_config_file = ( 4300 param_exomiser_preset 4301 ) 4302 # elif os.path.exists(full_path(param_exomiser_preset)): 4303 # # Preset file is provided in full path 4304 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4305 elif os.path.exists( 4306 os.path.join(folder_config, param_exomiser_preset) 4307 ): 4308 # Preset file is provided a basename in config folder (can be a path with subfolders) 4309 param_exomiser_analysis_default_config_file = os.path.join( 4310 folder_config, param_exomiser_preset 4311 ) 4312 else: 4313 # Construct preset file 4314 
param_exomiser_analysis_default_config_file = os.path.join( 4315 folder_config, 4316 f"preset-{param_exomiser_preset}-analysis.json", 4317 ) 4318 4319 # If preset file exists 4320 param_exomiser_analysis_default_config_file = full_path( 4321 param_exomiser_analysis_default_config_file 4322 ) 4323 if os.path.exists(param_exomiser_analysis_default_config_file): 4324 # Load prest file into analysis dict (either yaml or json) 4325 with open( 4326 param_exomiser_analysis_default_config_file 4327 ) as json_file: 4328 # param_exomiser_analysis_dict[""] = json.load(json_file) 4329 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4330 json_file 4331 ) 4332 4333 # Error preset file 4334 else: 4335 log.error( 4336 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4337 ) 4338 raise ValueError( 4339 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4340 ) 4341 4342 # If no analysis dict created 4343 if not param_exomiser_analysis_dict: 4344 log.error(f"No analysis config") 4345 raise ValueError(f"No analysis config") 4346 4347 # Log 4348 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4349 4350 ### PHENOPACKET ### 4351 ################### 4352 4353 # If no PhenoPacket in analysis dict -> check in param 4354 if "phenopacket" not in param_exomiser_analysis_dict: 4355 4356 # If PhenoPacket in param -> load anlaysis json 4357 if param_exomiser.get("phenopacket", None): 4358 4359 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4360 param_exomiser_phenopacket = full_path( 4361 param_exomiser_phenopacket 4362 ) 4363 4364 # If param phenopacket is a file and exists 4365 if isinstance( 4366 param_exomiser_phenopacket, str 4367 ) and os.path.exists(param_exomiser_phenopacket): 4368 # Load phenopacket file into analysis dict (either yaml or json) 4369 with open(param_exomiser_phenopacket) as json_file: 4370 param_exomiser_analysis_dict["phenopacket"] = ( 4371 yaml.safe_load(json_file) 
4372 ) 4373 4374 # If param phenopacket is a dict 4375 elif isinstance(param_exomiser_phenopacket, dict): 4376 # Load phenopacket dict into analysis dict (either yaml or json) 4377 param_exomiser_analysis_dict["phenopacket"] = ( 4378 param_exomiser_phenopacket 4379 ) 4380 4381 # Error phenopacket type 4382 else: 4383 log.error(f"Phenopacket type unknown. Check param file.") 4384 raise ValueError( 4385 f"Phenopacket type unknown. Check param file." 4386 ) 4387 4388 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4389 if "phenopacket" not in param_exomiser_analysis_dict: 4390 4391 # Init PhenoPacket 4392 param_exomiser_analysis_dict["phenopacket"] = { 4393 "id": "analysis", 4394 "proband": {}, 4395 } 4396 4397 ### Add subject ### 4398 4399 # If subject exists 4400 param_exomiser_subject = param_exomiser.get("subject", {}) 4401 4402 # If subject not exists -> found sample ID 4403 if not param_exomiser_subject: 4404 4405 # Found sample ID in param 4406 sample = param_exomiser.get("sample", None) 4407 4408 # Find sample ID (first sample) 4409 if not sample: 4410 sample_list = self.get_header_sample_list() 4411 if len(sample_list) > 0: 4412 sample = sample_list[0] 4413 else: 4414 log.error(f"No sample found") 4415 raise ValueError(f"No sample found") 4416 4417 # Create subject 4418 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4419 4420 # Add to dict 4421 param_exomiser_analysis_dict["phenopacket"][ 4422 "subject" 4423 ] = param_exomiser_subject 4424 4425 ### Add "phenotypicFeatures" ### 4426 4427 # If phenotypicFeatures exists 4428 param_exomiser_phenotypicfeatures = param_exomiser.get( 4429 "phenotypicFeatures", [] 4430 ) 4431 4432 # If phenotypicFeatures not exists -> Try to infer from hpo list 4433 if not param_exomiser_phenotypicfeatures: 4434 4435 # Found HPO in param 4436 param_exomiser_hpo = param_exomiser.get("hpo", []) 4437 4438 # Split HPO if list in string format separated by comma 4439 if 
isinstance(param_exomiser_hpo, str): 4440 param_exomiser_hpo = param_exomiser_hpo.split(",") 4441 4442 # Create HPO list 4443 for hpo in param_exomiser_hpo: 4444 hpo_clean = re.sub("[^0-9]", "", hpo) 4445 param_exomiser_phenotypicfeatures.append( 4446 { 4447 "type": { 4448 "id": f"HP:{hpo_clean}", 4449 "label": f"HP:{hpo_clean}", 4450 } 4451 } 4452 ) 4453 4454 # Add to dict 4455 param_exomiser_analysis_dict["phenopacket"][ 4456 "phenotypicFeatures" 4457 ] = param_exomiser_phenotypicfeatures 4458 4459 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4460 if not param_exomiser_phenotypicfeatures: 4461 for step in param_exomiser_analysis_dict.get( 4462 "analysis", {} 4463 ).get("steps", []): 4464 if "hiPhivePrioritiser" in step: 4465 param_exomiser_analysis_dict.get("analysis", {}).get( 4466 "steps", [] 4467 ).remove(step) 4468 4469 ### Add Input File ### 4470 4471 # Initial file name and htsFiles 4472 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4473 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4474 { 4475 "uri": tmp_vcf_name, 4476 "htsFormat": "VCF", 4477 "genomeAssembly": assembly, 4478 } 4479 ] 4480 4481 ### Add metaData ### 4482 4483 # If metaData not in analysis dict 4484 if "metaData" not in param_exomiser_analysis_dict: 4485 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4486 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4487 "createdBy": "howard", 4488 "phenopacketSchemaVersion": 1, 4489 } 4490 4491 ### OutputOptions ### 4492 4493 # Init output result folder 4494 output_results = os.path.join(tmp_dir, "results") 4495 4496 # If no outputOptions in analysis dict 4497 if "outputOptions" not in param_exomiser_analysis_dict: 4498 4499 # default output formats 4500 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4501 4502 # Get outputOptions in param 4503 output_options = param_exomiser.get("outputOptions", None) 4504 4505 # If no output_options in param -> check 4506 if not output_options: 
4507 output_options = { 4508 "outputContributingVariantsOnly": False, 4509 "numGenes": 0, 4510 "outputFormats": defaut_output_formats, 4511 } 4512 4513 # Replace outputDirectory in output options 4514 output_options["outputDirectory"] = output_results 4515 output_options["outputFileName"] = "howard" 4516 4517 # Add outputOptions in analysis dict 4518 param_exomiser_analysis_dict["outputOptions"] = output_options 4519 4520 else: 4521 4522 # Replace output_results and output format (if exists in param) 4523 param_exomiser_analysis_dict["outputOptions"][ 4524 "outputDirectory" 4525 ] = output_results 4526 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4527 list( 4528 set( 4529 param_exomiser_analysis_dict.get( 4530 "outputOptions", {} 4531 ).get("outputFormats", []) 4532 + ["TSV_VARIANT", "VCF"] 4533 ) 4534 ) 4535 ) 4536 4537 # log 4538 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4539 4540 ### ANALYSIS FILE ### 4541 ##################### 4542 4543 ### Full JSON analysis config file ### 4544 4545 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4546 with open(exomiser_analysis, "w") as fp: 4547 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4548 4549 ### SPLIT analysis and sample config files 4550 4551 # Splitted analysis dict 4552 param_exomiser_analysis_dict_for_split = ( 4553 param_exomiser_analysis_dict.copy() 4554 ) 4555 4556 # Phenopacket JSON file 4557 exomiser_analysis_phenopacket = os.path.join( 4558 tmp_dir, "analysis_phenopacket.json" 4559 ) 4560 with open(exomiser_analysis_phenopacket, "w") as fp: 4561 json.dump( 4562 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4563 fp, 4564 indent=4, 4565 ) 4566 4567 # Analysis JSON file without Phenopacket parameters 4568 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4569 exomiser_analysis_analysis = os.path.join( 4570 tmp_dir, "analysis_analysis.json" 4571 ) 4572 with open(exomiser_analysis_analysis, "w") as fp: 4573 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4574 4575 ### INITAL VCF file ### 4576 ####################### 4577 4578 ### Create list of samples to use and include inti initial VCF file #### 4579 4580 # Subject (main sample) 4581 # Get sample ID in analysis dict 4582 sample_subject = ( 4583 param_exomiser_analysis_dict.get("phenopacket", {}) 4584 .get("subject", {}) 4585 .get("id", None) 4586 ) 4587 sample_proband = ( 4588 param_exomiser_analysis_dict.get("phenopacket", {}) 4589 .get("proband", {}) 4590 .get("subject", {}) 4591 .get("id", None) 4592 ) 4593 sample = [] 4594 if sample_subject: 4595 sample.append(sample_subject) 4596 if sample_proband: 4597 sample.append(sample_proband) 4598 4599 # Get sample ID within Pedigree 4600 pedigree_persons_list = ( 4601 param_exomiser_analysis_dict.get("phenopacket", {}) 4602 .get("pedigree", {}) 4603 .get("persons", {}) 4604 ) 4605 4606 # Create list with all sample ID in pedigree (if exists) 4607 pedigree_persons = [] 4608 for person in pedigree_persons_list: 4609 pedigree_persons.append(person.get("individualId")) 4610 4611 # Concat subject sample ID and samples ID in pedigreesamples 4612 samples = list(set(sample + pedigree_persons)) 4613 4614 # Check if sample list is not empty 4615 if not samples: 4616 log.error(f"No samples found") 4617 raise ValueError(f"No samples found") 4618 4619 # Create VCF with sample (either sample in param or first one by default) 4620 # Export VCF file 4621 self.export_variant_vcf( 4622 vcf_file=tmp_vcf_name, 4623 remove_info=True, 4624 add_samples=True, 4625 list_samples=samples, 4626 index=False, 4627 ) 4628 4629 ### Execute Exomiser ### 4630 ######################## 4631 4632 # Init command 4633 exomiser_command = "" 4634 4635 # Command exomiser options 4636 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4637 4638 # Release 4639 exomiser_release = 
param_exomiser.get("release", None) 4640 if exomiser_release: 4641 # phenotype data version 4642 exomiser_options += ( 4643 f" --exomiser.phenotype.data-version={exomiser_release} " 4644 ) 4645 # data version 4646 exomiser_options += ( 4647 f" --exomiser.{assembly}.data-version={exomiser_release} " 4648 ) 4649 # variant white list 4650 variant_white_list_file = ( 4651 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4652 ) 4653 if os.path.exists( 4654 os.path.join( 4655 databases_folders, assembly, variant_white_list_file 4656 ) 4657 ): 4658 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4659 4660 # transcript_source 4661 transcript_source = param_exomiser.get( 4662 "transcript_source", None 4663 ) # ucsc, refseq, ensembl 4664 if transcript_source: 4665 exomiser_options += ( 4666 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4667 ) 4668 4669 # If analysis contain proband param 4670 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4671 "proband", {} 4672 ): 4673 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4674 4675 # If no proband (usually uniq sample) 4676 else: 4677 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4678 4679 # Log 4680 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4681 4682 # Run command 4683 result = subprocess.call( 4684 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4685 ) 4686 if result: 4687 log.error("Exomiser command failed") 4688 raise ValueError("Exomiser command failed") 4689 4690 ### RESULTS ### 4691 ############### 4692 4693 ### Annotate with TSV fields ### 4694 4695 # Init result tsv file 4696 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4697 4698 # Init result tsv file 4699 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4700 4701 # Parse TSV file and explode columns in INFO field 4702 if exomiser_to_info and os.path.exists(output_results_tsv): 4703 4704 # Log 4705 log.debug("Exomiser columns to VCF INFO field") 4706 4707 # Retrieve columns and types 4708 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4709 output_results_tsv_df = self.get_query_to_df(query) 4710 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4711 4712 # Init concat fields for update 4713 sql_query_update_concat_fields = [] 4714 4715 # Fields to avoid 4716 fields_to_avoid = [ 4717 "CONTIG", 4718 "START", 4719 "END", 4720 "REF", 4721 "ALT", 4722 "QUAL", 4723 "FILTER", 4724 "GENOTYPE", 4725 ] 4726 4727 # List all columns to add into header 4728 for header_column in output_results_tsv_columns: 4729 4730 # If header column is enable 4731 if header_column not in fields_to_avoid: 4732 4733 # Header info type 4734 header_info_type = "String" 4735 header_column_df = output_results_tsv_df[header_column] 4736 header_column_df_dtype = header_column_df.dtype 4737 if header_column_df_dtype == object: 4738 if ( 4739 pd.to_numeric(header_column_df, errors="coerce") 4740 .notnull() 4741 .all() 4742 ): 4743 header_info_type = "Float" 4744 else: 4745 header_info_type = "Integer" 4746 4747 # Header info 4748 characters_to_validate = ["-"] 4749 pattern = "[" + "".join(characters_to_validate) + "]" 4750 header_info_name = re.sub( 4751 pattern, 4752 "_", 4753 f"Exomiser_{header_column}".replace("#", ""), 4754 ) 4755 header_info_number = "." 
4756 header_info_description = ( 4757 f"Exomiser {header_column} annotation" 4758 ) 4759 header_info_source = "Exomiser" 4760 header_info_version = "unknown" 4761 header_info_code = CODE_TYPE_MAP[header_info_type] 4762 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4763 header_info_name, 4764 header_info_number, 4765 header_info_type, 4766 header_info_description, 4767 header_info_source, 4768 header_info_version, 4769 header_info_code, 4770 ) 4771 4772 # Add field to add for update to concat fields 4773 sql_query_update_concat_fields.append( 4774 f""" 4775 CASE 4776 WHEN table_parquet."{header_column}" NOT IN ('','.') 4777 THEN concat( 4778 '{header_info_name}=', 4779 table_parquet."{header_column}", 4780 ';' 4781 ) 4782 4783 ELSE '' 4784 END 4785 """ 4786 ) 4787 4788 # Update query 4789 sql_query_update = f""" 4790 UPDATE {table_variants} as table_variants 4791 SET INFO = concat( 4792 CASE 4793 WHEN INFO NOT IN ('', '.') 4794 THEN INFO 4795 ELSE '' 4796 END, 4797 CASE 4798 WHEN table_variants.INFO NOT IN ('','.') 4799 THEN ';' 4800 ELSE '' 4801 END, 4802 ( 4803 SELECT 4804 concat( 4805 {",".join(sql_query_update_concat_fields)} 4806 ) 4807 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4808 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4809 AND table_parquet.\"START\" = table_variants.\"POS\" 4810 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4811 AND table_parquet.\"REF\" = table_variants.\"REF\" 4812 ) 4813 ) 4814 ; 4815 """ 4816 4817 # Update 4818 self.conn.execute(sql_query_update) 4819 4820 ### Annotate with VCF INFO field ### 4821 4822 # Init result VCF file 4823 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4824 4825 # If VCF exists 4826 if os.path.exists(output_results_vcf): 4827 4828 # Log 4829 log.debug("Exomiser result VCF update variants") 4830 4831 # Find Exomiser INFO field annotation in header 4832 with 
gzip.open(output_results_vcf, "rt") as f: 4833 header_list = self.read_vcf_header(f) 4834 exomiser_vcf_header = vcf.Reader( 4835 io.StringIO("\n".join(header_list)) 4836 ) 4837 4838 # Add annotation INFO field to header 4839 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4840 4841 # Update variants with VCF 4842 self.update_from_vcf(output_results_vcf) 4843 4844 return True 4845 4846 def annotation_snpeff(self, threads: int = None) -> None: 4847 """ 4848 This function annotate with snpEff 4849 4850 :param threads: The number of threads to use 4851 :return: the value of the variable "return_value". 4852 """ 4853 4854 # DEBUG 4855 log.debug("Start annotation with snpeff databases") 4856 4857 # Threads 4858 if not threads: 4859 threads = self.get_threads() 4860 log.debug("Threads: " + str(threads)) 4861 4862 # DEBUG 4863 delete_tmp = True 4864 if self.get_config().get("verbosity", "warning") in ["debug"]: 4865 delete_tmp = False 4866 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4867 4868 # Config 4869 config = self.get_config() 4870 log.debug("Config: " + str(config)) 4871 4872 # Config - Folders - Databases 4873 databases_folders = ( 4874 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4875 ) 4876 log.debug("Databases annotations: " + str(databases_folders)) 4877 4878 # # Config - Java 4879 # java_bin = get_bin( 4880 # tool="java", 4881 # bin="java", 4882 # bin_type="bin", 4883 # config=config, 4884 # default_folder="/usr/bin", 4885 # ) 4886 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4887 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4888 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4889 4890 # # Config - snpEff bin 4891 # snpeff_jar = get_bin( 4892 # tool="snpeff", 4893 # bin="snpEff.jar", 4894 # bin_type="jar", 4895 # config=config, 4896 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4897 # ) 4898 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4899 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4900 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4901 4902 # Config - snpEff bin command 4903 snpeff_bin_command = get_bin_command( 4904 bin="snpEff.jar", 4905 tool="snpeff", 4906 bin_type="jar", 4907 config=config, 4908 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4909 ) 4910 if not snpeff_bin_command: 4911 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4912 log.error(msg_err) 4913 raise ValueError(msg_err) 4914 4915 # Config - snpEff databases 4916 snpeff_databases = ( 4917 config.get("folders", {}) 4918 .get("databases", {}) 4919 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4920 ) 4921 snpeff_databases = full_path(snpeff_databases) 4922 if snpeff_databases is not None and snpeff_databases != "": 4923 log.debug(f"Create snpEff databases folder") 4924 if not os.path.exists(snpeff_databases): 4925 os.makedirs(snpeff_databases) 4926 4927 # Param 4928 param = self.get_param() 4929 log.debug("Param: " + str(param)) 4930 4931 # Param 4932 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4933 log.debug("Options: " + str(options)) 4934 4935 # Param - Assembly 4936 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4937 4938 # Param - Options 4939 snpeff_options = ( 4940 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4941 ) 4942 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4943 snpeff_csvstats = ( 4944 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4945 ) 4946 if snpeff_stats: 4947 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4948 snpeff_stats = full_path(snpeff_stats) 4949 snpeff_options += f" -stats {snpeff_stats}" 4950 if snpeff_csvstats: 4951 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4952 snpeff_csvstats = full_path(snpeff_csvstats) 4953 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4954 4955 # Data 4956 table_variants = self.get_table_variants() 4957 4958 # Check if not empty 4959 log.debug("Check if not empty") 4960 sql_query_chromosomes = ( 4961 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4962 ) 4963 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4964 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4965 log.info(f"VCF empty") 4966 return 4967 4968 # Export in VCF 4969 log.debug("Create initial file to annotate") 4970 tmp_vcf = NamedTemporaryFile( 4971 prefix=self.get_prefix(), 4972 dir=self.get_tmp_dir(), 4973 suffix=".vcf.gz", 4974 delete=True, 4975 ) 4976 tmp_vcf_name = tmp_vcf.name 4977 4978 # VCF header 4979 vcf_reader = self.get_header() 4980 log.debug("Initial header: " + str(vcf_reader.infos)) 4981 4982 # Existing annotations 4983 for vcf_annotation in self.get_header().infos: 4984 4985 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4986 log.debug( 4987 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4988 ) 4989 4990 # Memory limit 4991 # if config.get("memory", None): 4992 # memory_limit = config.get("memory", "8G") 4993 # else: 4994 # memory_limit = "8G" 4995 memory_limit = self.get_memory("8G") 4996 log.debug(f"memory_limit: {memory_limit}") 4997 4998 # snpEff java options 4999 snpeff_java_options = ( 5000 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5001 ) 5002 log.debug(f"Exomiser java options: {snpeff_java_options}") 5003 5004 force_update_annotation = True 5005 5006 if "ANN" not in self.get_header().infos or force_update_annotation: 5007 5008 # Check snpEff database 5009 log.debug(f"Check snpEff databases {[assembly]}") 5010 databases_download_snpeff( 5011 folder=snpeff_databases, assemblies=[assembly], config=config 5012 ) 5013 5014 # Export VCF file 5015 self.export_variant_vcf( 5016 vcf_file=tmp_vcf_name, 5017 remove_info=True, 
5018 add_samples=False, 5019 index=True, 5020 ) 5021 5022 # Tmp file 5023 err_files = [] 5024 tmp_annotate_vcf = NamedTemporaryFile( 5025 prefix=self.get_prefix(), 5026 dir=self.get_tmp_dir(), 5027 suffix=".vcf", 5028 delete=False, 5029 ) 5030 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5031 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5032 err_files.append(tmp_annotate_vcf_name_err) 5033 5034 # Command 5035 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5036 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5037 run_parallel_commands([snpeff_command], 1) 5038 5039 # Error messages 5040 log.info(f"Error/Warning messages:") 5041 error_message_command_all = [] 5042 error_message_command_warning = [] 5043 error_message_command_err = [] 5044 for err_file in err_files: 5045 with open(err_file, "r") as f: 5046 for line in f: 5047 message = line.strip() 5048 error_message_command_all.append(message) 5049 if line.startswith("[W::"): 5050 error_message_command_warning.append(message) 5051 if line.startswith("[E::"): 5052 error_message_command_err.append(f"{err_file}: " + message) 5053 # log info 5054 for message in list( 5055 set(error_message_command_err + error_message_command_warning) 5056 ): 5057 log.info(f" {message}") 5058 # debug info 5059 for message in list(set(error_message_command_all)): 5060 log.debug(f" {message}") 5061 # failed 5062 if len(error_message_command_err): 5063 log.error("Annotation failed: Error in commands") 5064 raise ValueError("Annotation failed: Error in commands") 5065 5066 # Find annotation in header 5067 with open(tmp_annotate_vcf_name, "rt") as f: 5068 header_list = self.read_vcf_header(f) 5069 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5070 5071 for ann in annovar_vcf_header.infos: 5072 if ann not in self.get_header().infos: 5073 vcf_reader.infos[ann] = 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar.

        For each configured Annovar database: exports the variants to a
        temporary VCF, runs `table_annovar.pl`, post-processes the result
        through a bcftools/sed/awk pipeline (cleaning ANNOVAR_DATE, escaped
        semicolons and empty "." fields, keeping/renaming only the requested
        INFO fields), then merges all per-database annotated VCFs with
        `bcftools merge`, folds any new INFO header fields into the in-memory
        header, and updates the variants table from the merged VCF.

        :param threads: number of threads to use (defaults to
            `self.get_threads()`)
        :raises ValueError: if the annovar or bcftools command cannot be
            resolved, or if a pipeline run produced error lines in stderr
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected for the final cleanup step)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is logged but not used to guard the cleanup
        # below — tmp files are removed unconditionally.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl table_annovar.pl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (create folder if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> fields to keep
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (create if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded True, so already-present fields are always
        # re-annotated and the "already exists (skipped)" branch never fires.
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never used after initialization.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools
            # --rename-annots below)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing ones)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No explicit fields -> keep the whole INFO content
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl names its VCF output <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line, appended
                    # via a shell echo)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: Annovar operation code — "g" gene-based,
                # "r" region-based, "f" filter-based (default)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not passed through)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" means keep x, drop the rest (bcftools -x syntax)
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: classify stderr lines by htslib/Annovar prefixes
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine original VCF with all per-database
                # annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and fold any new
                # INFO fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): 'if True' looks like a leftover toggle — presumably it
        # was meant to be 'if delete_tmp'; confirm before changing.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
False) 5522 ) 5523 log.debug(f"force_update_annotation={force_update_annotation}") 5524 force_append_annotation = ( 5525 self.get_param() 5526 .get("annotation", {}) 5527 .get("options", {}) 5528 .get("annotations_append", False) 5529 ) 5530 log.debug(f"force_append_annotation={force_append_annotation}") 5531 5532 # Data 5533 table_variants = self.get_table_variants() 5534 5535 # Check if not empty 5536 log.debug("Check if not empty") 5537 sql_query_chromosomes_df = self.get_query_to_df( 5538 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5539 ) 5540 if not sql_query_chromosomes_df["count"][0]: 5541 log.info(f"VCF empty") 5542 return 5543 5544 # VCF header 5545 vcf_reader = self.get_header() 5546 log.debug("Initial header: " + str(vcf_reader.infos)) 5547 5548 # Nb Variants POS 5549 log.debug("NB Variants Start") 5550 nb_variants = self.conn.execute( 5551 f"SELECT count(*) AS count FROM variants" 5552 ).fetchdf()["count"][0] 5553 log.debug("NB Variants Stop") 5554 5555 # Existing annotations 5556 for vcf_annotation in self.get_header().infos: 5557 5558 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5559 log.debug( 5560 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5561 ) 5562 5563 # Added columns 5564 added_columns = [] 5565 5566 # drop indexes 5567 log.debug(f"Drop indexes...") 5568 self.drop_indexes() 5569 5570 if annotations: 5571 5572 if "ALL" in annotations: 5573 5574 all_param = annotations.get("ALL", {}) 5575 all_param_formats = all_param.get("formats", None) 5576 all_param_releases = all_param.get("releases", None) 5577 5578 databases_infos_dict = self.scan_databases( 5579 database_formats=all_param_formats, 5580 database_releases=all_param_releases, 5581 ) 5582 for database_infos in databases_infos_dict.keys(): 5583 if database_infos not in annotations: 5584 annotations[database_infos] = {"INFO": None} 5585 5586 for annotation in annotations: 5587 5588 if annotation in ["ALL"]: 
5589 continue 5590 5591 # Annotation Name 5592 annotation_name = os.path.basename(annotation) 5593 5594 # Annotation fields 5595 annotation_fields = annotations[annotation] 5596 if not annotation_fields: 5597 annotation_fields = {"INFO": None} 5598 5599 log.debug(f"Annotation '{annotation_name}'") 5600 log.debug( 5601 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5602 ) 5603 5604 # Create Database 5605 database = Database( 5606 database=annotation, 5607 databases_folders=databases_folders, 5608 assembly=assembly, 5609 ) 5610 5611 # Find files 5612 parquet_file = database.get_database() 5613 parquet_hdr_file = database.get_header_file() 5614 parquet_type = database.get_type() 5615 5616 # Check if files exists 5617 if not parquet_file or not parquet_hdr_file: 5618 log.error("Annotation failed: file not found") 5619 raise ValueError("Annotation failed: file not found") 5620 else: 5621 # Get parquet connexion 5622 parquet_sql_attach = database.get_sql_database_attach( 5623 output="query" 5624 ) 5625 if parquet_sql_attach: 5626 self.conn.execute(parquet_sql_attach) 5627 parquet_file_link = database.get_sql_database_link() 5628 # Log 5629 log.debug( 5630 f"Annotation '{annotation_name}' - file: " 5631 + str(parquet_file) 5632 + " and " 5633 + str(parquet_hdr_file) 5634 ) 5635 5636 # Database full header columns 5637 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5638 parquet_hdr_file 5639 ) 5640 # Log 5641 log.debug( 5642 "Annotation database header columns : " 5643 + str(parquet_hdr_vcf_header_columns) 5644 ) 5645 5646 # Load header as VCF object 5647 parquet_hdr_vcf_header_infos = database.get_header().infos 5648 # Log 5649 log.debug( 5650 "Annotation database header: " 5651 + str(parquet_hdr_vcf_header_infos) 5652 ) 5653 5654 # Get extra infos 5655 parquet_columns = database.get_extra_columns() 5656 # Log 5657 log.debug("Annotation database Columns: " + str(parquet_columns)) 5658 5659 # Add extra columns if "ALL" in 
annotation_fields 5660 # if "ALL" in annotation_fields: 5661 # allow_add_extra_column = True 5662 if "ALL" in annotation_fields and database.get_extra_columns(): 5663 for extra_column in database.get_extra_columns(): 5664 if ( 5665 extra_column not in annotation_fields 5666 and extra_column.replace("INFO/", "") 5667 not in parquet_hdr_vcf_header_infos 5668 ): 5669 parquet_hdr_vcf_header_infos[extra_column] = ( 5670 vcf.parser._Info( 5671 extra_column, 5672 ".", 5673 "String", 5674 f"{extra_column} description", 5675 "unknown", 5676 "unknown", 5677 self.code_type_map["String"], 5678 ) 5679 ) 5680 5681 # For all fields in database 5682 annotation_fields_all = False 5683 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5684 annotation_fields_all = True 5685 annotation_fields = { 5686 key: key for key in parquet_hdr_vcf_header_infos 5687 } 5688 5689 log.debug( 5690 "Annotation database header - All annotations added: " 5691 + str(annotation_fields) 5692 ) 5693 5694 # Init 5695 5696 # List of annotation fields to use 5697 sql_query_annotation_update_info_sets = [] 5698 5699 # List of annotation to agregate 5700 sql_query_annotation_to_agregate = [] 5701 5702 # Number of fields 5703 nb_annotation_field = 0 5704 5705 # Annotation fields processed 5706 annotation_fields_processed = [] 5707 5708 # Columns mapping 5709 map_columns = database.map_columns( 5710 columns=annotation_fields, prefixes=["INFO/"] 5711 ) 5712 5713 # Query dict for fields to remove (update option) 5714 query_dict_remove = {} 5715 5716 # Fetch Anotation fields 5717 for annotation_field in annotation_fields: 5718 5719 # annotation_field_column 5720 annotation_field_column = map_columns.get( 5721 annotation_field, "INFO" 5722 ) 5723 5724 # field new name, if parametered 5725 annotation_fields_new_name = annotation_fields.get( 5726 annotation_field, annotation_field 5727 ) 5728 if not annotation_fields_new_name: 5729 annotation_fields_new_name = annotation_field 5730 5731 # To annotate 5732 # 
force_update_annotation = True 5733 # force_append_annotation = True 5734 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5735 if annotation_field in parquet_hdr_vcf_header_infos and ( 5736 force_update_annotation 5737 or force_append_annotation 5738 or ( 5739 annotation_fields_new_name 5740 not in self.get_header().infos 5741 ) 5742 ): 5743 5744 # Add field to annotation to process list 5745 annotation_fields_processed.append( 5746 annotation_fields_new_name 5747 ) 5748 5749 # explode infos for the field 5750 annotation_fields_new_name_info_msg = "" 5751 if ( 5752 force_update_annotation 5753 and annotation_fields_new_name 5754 in self.get_header().infos 5755 ): 5756 # Remove field from INFO 5757 query = f""" 5758 UPDATE {table_variants} as table_variants 5759 SET INFO = REGEXP_REPLACE( 5760 concat(table_variants.INFO,''), 5761 ';*{annotation_fields_new_name}=[^;]*', 5762 '' 5763 ) 5764 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5765 """ 5766 annotation_fields_new_name_info_msg = " [update]" 5767 query_dict_remove[ 5768 f"remove 'INFO/{annotation_fields_new_name}'" 5769 ] = query 5770 5771 # Sep between fields in INFO 5772 nb_annotation_field += 1 5773 if nb_annotation_field > 1: 5774 annotation_field_sep = ";" 5775 else: 5776 annotation_field_sep = "" 5777 5778 log.info( 5779 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5780 ) 5781 5782 # Add INFO field to header 5783 parquet_hdr_vcf_header_infos_number = ( 5784 parquet_hdr_vcf_header_infos[annotation_field].num 5785 or "." 
5786 ) 5787 parquet_hdr_vcf_header_infos_type = ( 5788 parquet_hdr_vcf_header_infos[annotation_field].type 5789 or "String" 5790 ) 5791 parquet_hdr_vcf_header_infos_description = ( 5792 parquet_hdr_vcf_header_infos[annotation_field].desc 5793 or f"{annotation_field} description" 5794 ) 5795 parquet_hdr_vcf_header_infos_source = ( 5796 parquet_hdr_vcf_header_infos[annotation_field].source 5797 or "unknown" 5798 ) 5799 parquet_hdr_vcf_header_infos_version = ( 5800 parquet_hdr_vcf_header_infos[annotation_field].version 5801 or "unknown" 5802 ) 5803 5804 vcf_reader.infos[annotation_fields_new_name] = ( 5805 vcf.parser._Info( 5806 annotation_fields_new_name, 5807 parquet_hdr_vcf_header_infos_number, 5808 parquet_hdr_vcf_header_infos_type, 5809 parquet_hdr_vcf_header_infos_description, 5810 parquet_hdr_vcf_header_infos_source, 5811 parquet_hdr_vcf_header_infos_version, 5812 self.code_type_map[ 5813 parquet_hdr_vcf_header_infos_type 5814 ], 5815 ) 5816 ) 5817 5818 # Append 5819 if force_append_annotation: 5820 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5821 else: 5822 query_case_when_append = "" 5823 5824 # Annotation/Update query fields 5825 # Found in INFO column 5826 if ( 5827 annotation_field_column == "INFO" 5828 and "INFO" in parquet_hdr_vcf_header_columns 5829 ): 5830 sql_query_annotation_update_info_sets.append( 5831 f""" 5832 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5833 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5834 ELSE '' 5835 END 5836 """ 5837 ) 5838 # Found in a specific column 5839 else: 5840 sql_query_annotation_update_info_sets.append( 5841 f""" 5842 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 
5843 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 5844 ELSE '' 5845 END 5846 """ 5847 ) 5848 sql_query_annotation_to_agregate.append( 5849 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5850 ) 5851 5852 # Not to annotate 5853 else: 5854 5855 if force_update_annotation: 5856 annotation_message = "forced" 5857 else: 5858 annotation_message = "skipped" 5859 5860 if annotation_field not in parquet_hdr_vcf_header_infos: 5861 log.warning( 5862 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5863 ) 5864 if annotation_fields_new_name in self.get_header().infos: 5865 log.warning( 5866 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5867 ) 5868 5869 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 5870 # allow_annotation_full_info = True 5871 allow_annotation_full_info = not force_append_annotation 5872 5873 if parquet_type in ["regions"]: 5874 allow_annotation_full_info = False 5875 5876 if ( 5877 allow_annotation_full_info 5878 and nb_annotation_field == len(annotation_fields) 5879 and annotation_fields_all 5880 and ( 5881 "INFO" in parquet_hdr_vcf_header_columns 5882 and "INFO" in database.get_extra_columns() 5883 ) 5884 ): 5885 log.debug("Column INFO annotation enabled") 5886 sql_query_annotation_update_info_sets = [] 5887 sql_query_annotation_update_info_sets.append( 5888 f" table_parquet.INFO " 5889 ) 5890 5891 if sql_query_annotation_update_info_sets: 5892 5893 # Annotate 5894 log.info(f"Annotation '{annotation_name}' - Annotation...") 5895 5896 # Join query annotation update info sets for SQL 5897 sql_query_annotation_update_info_sets_sql = ",".join( 5898 sql_query_annotation_update_info_sets 5899 ) 5900 5901 # Check chromosomes list (and variants infos) 5902 sql_query_chromosomes = f""" 5903 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5904 FROM {table_variants} as table_variants 5905 GROUP BY table_variants."#CHROM" 5906 ORDER BY table_variants."#CHROM" 5907 """ 5908 sql_query_chromosomes_df = self.conn.execute( 5909 sql_query_chromosomes 5910 ).df() 5911 sql_query_chromosomes_dict = { 5912 entry["CHROM"]: { 5913 "count": entry["count_variants"], 5914 "min": entry["min_variants"], 5915 "max": entry["max_variants"], 5916 } 5917 for index, entry in sql_query_chromosomes_df.iterrows() 5918 } 5919 5920 # Init 5921 nb_of_query = 0 5922 nb_of_variant_annotated = 0 5923 query_dict = query_dict_remove 5924 5925 # for chrom in sql_query_chromosomes_df["CHROM"]: 5926 for chrom in sql_query_chromosomes_dict: 5927 5928 # Number of variant by chromosome 5929 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5930 chrom, {} 5931 ).get("count", 0) 5932 5933 
log.debug( 5934 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 5935 ) 5936 5937 # Annotation with regions database 5938 if parquet_type in ["regions"]: 5939 sql_query_annotation_from_clause = f""" 5940 FROM ( 5941 SELECT 5942 '{chrom}' AS \"#CHROM\", 5943 table_variants_from.\"POS\" AS \"POS\", 5944 {",".join(sql_query_annotation_to_agregate)} 5945 FROM {table_variants} as table_variants_from 5946 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5947 table_parquet_from."#CHROM" = '{chrom}' 5948 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5949 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5950 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5951 ) 5952 ) 5953 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5954 GROUP BY table_variants_from.\"POS\" 5955 ) 5956 as table_parquet 5957 """ 5958 5959 sql_query_annotation_where_clause = """ 5960 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5961 AND table_parquet.\"POS\" = table_variants.\"POS\" 5962 """ 5963 5964 # Annotation with variants database 5965 else: 5966 sql_query_annotation_from_clause = f""" 5967 FROM {parquet_file_link} as table_parquet 5968 """ 5969 sql_query_annotation_where_clause = f""" 5970 table_variants."#CHROM" = '{chrom}' 5971 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5972 AND table_parquet.\"POS\" = table_variants.\"POS\" 5973 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5974 AND table_parquet.\"REF\" = table_variants.\"REF\" 5975 """ 5976 5977 # Create update query 5978 sql_query_annotation_chrom_interval_pos = f""" 5979 UPDATE {table_variants} as table_variants 5980 SET INFO = 5981 concat( 5982 CASE WHEN table_variants.INFO NOT IN ('','.') 5983 THEN table_variants.INFO 5984 ELSE '' 5985 END 5986 , 5987 CASE WHEN table_variants.INFO NOT IN ('','.') 5988 AND ( 5989 
concat({sql_query_annotation_update_info_sets_sql}) 5990 ) 5991 NOT IN ('','.') 5992 THEN ';' 5993 ELSE '' 5994 END 5995 , 5996 {sql_query_annotation_update_info_sets_sql} 5997 ) 5998 {sql_query_annotation_from_clause} 5999 WHERE {sql_query_annotation_where_clause} 6000 ; 6001 """ 6002 6003 # Add update query to dict 6004 query_dict[ 6005 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6006 ] = sql_query_annotation_chrom_interval_pos 6007 6008 nb_of_query = len(query_dict) 6009 num_query = 0 6010 6011 # SET max_expression_depth TO x 6012 self.conn.execute("SET max_expression_depth TO 10000") 6013 6014 for query_name in query_dict: 6015 query = query_dict[query_name] 6016 num_query += 1 6017 log.info( 6018 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6019 ) 6020 result = self.conn.execute(query) 6021 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6022 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6023 log.info( 6024 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6025 ) 6026 6027 log.info( 6028 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6029 ) 6030 6031 else: 6032 6033 log.info( 6034 f"Annotation '{annotation_name}' - No Annotations available" 6035 ) 6036 6037 log.debug("Final header: " + str(vcf_reader.infos)) 6038 6039 # Remove added columns 6040 for added_column in added_columns: 6041 self.drop_column(column=added_column) 6042 6043 def annotation_splice(self, threads: int = None) -> None: 6044 """ 6045 This function annotate with snpEff 6046 6047 :param threads: The number of threads to use 6048 :return: the value of the variable "return_value". 
6049 """ 6050 6051 # DEBUG 6052 log.debug("Start annotation with splice tools") 6053 6054 # Threads 6055 if not threads: 6056 threads = self.get_threads() 6057 log.debug("Threads: " + str(threads)) 6058 6059 # DEBUG 6060 delete_tmp = True 6061 if self.get_config().get("verbosity", "warning") in ["debug"]: 6062 delete_tmp = False 6063 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6064 6065 # Config 6066 config = self.get_config() 6067 log.debug("Config: " + str(config)) 6068 splice_config = config.get("tools", {}).get("splice", {}) 6069 if not splice_config: 6070 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6071 if not splice_config: 6072 msg_err = "No Splice tool config" 6073 log.error(msg_err) 6074 raise ValueError(msg_err) 6075 log.debug(f"splice_config={splice_config}") 6076 6077 # Config - Folders - Databases 6078 databases_folders = ( 6079 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6080 ) 6081 log.debug("Databases annotations: " + str(databases_folders)) 6082 6083 # Splice docker image 6084 splice_docker_image = splice_config.get("docker").get("image") 6085 6086 # Pull splice image if it's not already there 6087 if not check_docker_image_exists(splice_docker_image): 6088 log.warning( 6089 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6090 ) 6091 try: 6092 command(f"docker pull {splice_config.get('docker').get('image')}") 6093 except subprocess.CalledProcessError: 6094 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6095 log.error(msg_err) 6096 raise ValueError(msg_err) 6097 return None 6098 6099 # Config - splice databases 6100 splice_databases = ( 6101 config.get("folders", {}) 6102 .get("databases", {}) 6103 .get("splice", DEFAULT_SPLICE_FOLDER) 6104 ) 6105 splice_databases = full_path(splice_databases) 6106 6107 # Param 6108 param = self.get_param() 6109 log.debug("Param: " + str(param)) 6110 6111 # Param 6112 options = 
param.get("annotation", {}).get("splice", {}) 6113 log.debug("Options: " + str(options)) 6114 6115 # Data 6116 table_variants = self.get_table_variants() 6117 6118 # Check if not empty 6119 log.debug("Check if not empty") 6120 sql_query_chromosomes = ( 6121 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6122 ) 6123 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6124 log.info("VCF empty") 6125 return None 6126 6127 # Export in VCF 6128 log.debug("Create initial file to annotate") 6129 6130 # Create output folder 6131 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6132 if not os.path.exists(output_folder): 6133 Path(output_folder).mkdir(parents=True, exist_ok=True) 6134 6135 # Create tmp VCF file 6136 tmp_vcf = NamedTemporaryFile( 6137 prefix=self.get_prefix(), 6138 dir=output_folder, 6139 suffix=".vcf", 6140 delete=False, 6141 ) 6142 tmp_vcf_name = tmp_vcf.name 6143 6144 # VCF header 6145 header = self.get_header() 6146 6147 # Existing annotations 6148 for vcf_annotation in self.get_header().infos: 6149 6150 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6151 log.debug( 6152 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6153 ) 6154 6155 # Memory limit 6156 if config.get("memory", None): 6157 memory_limit = config.get("memory", "8G").upper() 6158 # upper() 6159 else: 6160 memory_limit = "8G" 6161 log.debug(f"memory_limit: {memory_limit}") 6162 6163 # Check number of variants to annotate 6164 where_clause_regex_spliceai = r"SpliceAI_\w+" 6165 where_clause_regex_spip = r"SPiP_\w+" 6166 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6167 df_list_of_variants_to_annotate = self.get_query_to_df( 6168 query=f""" SELECT * FROM variants {where_clause} """ 6169 ) 6170 if len(df_list_of_variants_to_annotate) == 0: 6171 log.warning( 6172 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6173 ) 6174 return None 6175 else: 6176 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6177 6178 # Export VCF file 6179 self.export_variant_vcf( 6180 vcf_file=tmp_vcf_name, 6181 remove_info=True, 6182 add_samples=True, 6183 index=False, 6184 where_clause=where_clause, 6185 ) 6186 6187 # Create docker container and launch splice analysis 6188 if splice_config: 6189 6190 # Splice mount folders 6191 mount_folders = splice_config.get("mount", {}) 6192 6193 # Genome mount 6194 mount_folders[ 6195 config.get("folders", {}) 6196 .get("databases", {}) 6197 .get("genomes", DEFAULT_GENOME_FOLDER) 6198 ] = "ro" 6199 6200 # SpliceAI mount 6201 mount_folders[ 6202 config.get("folders", {}) 6203 .get("databases", {}) 6204 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6205 ] = "ro" 6206 6207 # Genome mount 6208 mount_folders[ 6209 config.get("folders", {}) 6210 .get("databases", {}) 6211 .get("spip", DEFAULT_SPIP_FOLDER) 6212 ] = "ro" 6213 6214 # Mount folders 6215 mount = [] 6216 6217 # Config mount 6218 mount = [ 6219 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6220 for path, mode in mount_folders.items() 6221 ] 6222 6223 if any(value for value in splice_config.values() if value is None): 6224 log.warning("At least one splice config parameter is empty") 6225 return None 6226 6227 # Params in splice nf 6228 def check_values(dico: dict): 6229 """ 6230 Ensure parameters for NF splice pipeline 6231 """ 6232 for key, val in dico.items(): 6233 if key == "genome": 6234 if any( 6235 assemb in options.get("genome", {}) 6236 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6237 ): 6238 yield f"--{key} hg19" 6239 elif any( 6240 assemb in options.get("genome", {}) 6241 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6242 ): 6243 yield f"--{key} hg38" 6244 elif ( 6245 (isinstance(val, str) and val) 6246 or isinstance(val, int) 6247 or isinstance(val, bool) 6248 ): 6249 yield f"--{key} 
{val}" 6250 6251 # Genome 6252 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6253 options["genome"] = genome 6254 6255 # NF params 6256 nf_params = [] 6257 6258 # Add options 6259 if options: 6260 nf_params = list(check_values(options)) 6261 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6262 else: 6263 log.debug("No NF params provided") 6264 6265 # Add threads 6266 if "threads" not in options.keys(): 6267 nf_params.append(f"--threads {threads}") 6268 6269 # Genome path 6270 genome_path = find_genome( 6271 config.get("folders", {}) 6272 .get("databases", {}) 6273 .get("genomes", DEFAULT_GENOME_FOLDER), 6274 file=f"{genome}.fa", 6275 ) 6276 # Add genome path 6277 if not genome_path: 6278 raise ValueError( 6279 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6280 ) 6281 else: 6282 log.debug(f"Genome: {genome_path}") 6283 nf_params.append(f"--genome_path {genome_path}") 6284 6285 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6286 """ 6287 Setting up updated databases for SPiP and SpliceAI 6288 """ 6289 6290 try: 6291 6292 # SpliceAI assembly transcriptome 6293 spliceai_assembly = os.path.join( 6294 config.get("folders", {}) 6295 .get("databases", {}) 6296 .get("spliceai", {}), 6297 options.get("genome"), 6298 "transcriptome", 6299 ) 6300 spip_assembly = options.get("genome") 6301 6302 spip = find( 6303 f"transcriptome_{spip_assembly}.RData", 6304 config.get("folders", {}).get("databases", {}).get("spip", {}), 6305 ) 6306 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6307 log.debug(f"SPiP annotations: {spip}") 6308 log.debug(f"SpliceAI annotations: {spliceai}") 6309 if spip and spliceai: 6310 return [ 6311 f"--spip_transcriptome {spip}", 6312 f"--spliceai_annotations {spliceai}", 6313 ] 6314 else: 6315 # TODO crash and go on with basic annotations ? 
6316 # raise ValueError( 6317 # "Can't find splice databases in configuration EXIT" 6318 # ) 6319 log.warning( 6320 "Can't find splice databases in configuration, use annotations file from image" 6321 ) 6322 except TypeError: 6323 log.warning( 6324 "Can't find splice databases in configuration, use annotations file from image" 6325 ) 6326 return [] 6327 6328 # Add options, check if transcriptome option have already beend provided 6329 if ( 6330 "spip_transcriptome" not in nf_params 6331 and "spliceai_transcriptome" not in nf_params 6332 ): 6333 splice_reference = splice_annotations(options, config) 6334 if splice_reference: 6335 nf_params.extend(splice_reference) 6336 6337 nf_params.append(f"--output_folder {output_folder}") 6338 6339 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6340 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6341 log.debug(cmd) 6342 6343 splice_config["docker"]["command"] = cmd 6344 6345 docker_cmd = get_bin_command( 6346 tool="splice", 6347 bin_type="docker", 6348 config=config, 6349 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6350 add_options=f"--name {random_uuid} {' '.join(mount)}", 6351 ) 6352 6353 # Docker debug 6354 # if splice_config.get("rm_container"): 6355 # rm_container = "--rm" 6356 # else: 6357 # rm_container = "" 6358 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6359 6360 log.debug(docker_cmd) 6361 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6362 log.debug(res.stdout) 6363 if res.stderr: 6364 log.error(res.stderr) 6365 res.check_returncode() 6366 else: 6367 log.warning(f"Splice tool configuration not found: {config}") 6368 
6369 # Update variants 6370 log.info("Annotation - Updating...") 6371 # Test find output vcf 6372 log.debug( 6373 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6374 ) 6375 output_vcf = [] 6376 # Wrong folder to look in 6377 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6378 if ( 6379 files 6380 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6381 ): 6382 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6383 # log.debug(os.listdir(options.get("output_folder"))) 6384 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6385 if not output_vcf: 6386 log.debug( 6387 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6388 ) 6389 else: 6390 # Get new header from annotated vcf 6391 log.debug(f"Initial header: {len(header.infos)} fields") 6392 # Create new header with splice infos 6393 new_vcf = Variants(input=output_vcf[0]) 6394 new_vcf_header = new_vcf.get_header().infos 6395 for keys, infos in new_vcf_header.items(): 6396 if keys not in header.infos.keys(): 6397 header.infos[keys] = infos 6398 log.debug(f"New header: {len(header.infos)} fields") 6399 log.debug(f"Splice tmp output: {output_vcf[0]}") 6400 self.update_from_vcf(output_vcf[0]) 6401 6402 # Remove folder 6403 remove_if_exists(output_folder) 6404 6405 ### 6406 # Prioritization 6407 ### 6408 6409 def get_config_default(self, name: str) -> dict: 6410 """ 6411 The function `get_config_default` returns a dictionary containing default configurations for 6412 various calculations and prioritizations. 6413 6414 :param name: The `get_config_default` function returns a dictionary containing default 6415 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6416 specify which specific configuration to retrieve from the dictionary 6417 :type name: str 6418 :return: The function `get_config_default` returns a dictionary containing default configuration 6419 settings for different calculations and prioritizations. The specific configuration settings are 6420 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6421 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6422 returned. If there is no match, an empty dictionary is returned. 6423 """ 6424 6425 config_default = { 6426 "calculations": { 6427 "variant_chr_pos_alt_ref": { 6428 "type": "sql", 6429 "name": "variant_chr_pos_alt_ref", 6430 "description": "Create a variant ID with chromosome, position, alt and ref", 6431 "available": False, 6432 "output_column_name": "variant_chr_pos_alt_ref", 6433 "output_column_type": "String", 6434 "output_column_description": "variant ID with chromosome, position, alt and ref", 6435 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6436 "operation_info": True, 6437 }, 6438 "VARTYPE": { 6439 "type": "sql", 6440 "name": "VARTYPE", 6441 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6442 "available": True, 6443 "output_column_name": "VARTYPE", 6444 "output_column_type": "String", 6445 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6446 "operation_query": """ 6447 CASE 6448 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6449 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6450 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6451 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6452 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6453 ELSE 'UNDEFINED' 6454 END 6455 """, 6456 "info_fields": ["SVTYPE"], 6457 "operation_info": True, 6458 }, 6459 "snpeff_hgvs": { 6460 "type": "python", 6461 "name": "snpeff_hgvs", 6462 "description": "HGVS nomenclatures from snpEff annotation", 6463 "available": True, 6464 "function_name": "calculation_extract_snpeff_hgvs", 6465 "function_params": ["snpeff_hgvs", "ANN"], 6466 }, 6467 "snpeff_ann_explode": { 6468 "type": "python", 6469 "name": "snpeff_ann_explode", 6470 "description": "Explode snpEff annotations with uniquify values", 6471 "available": True, 6472 "function_name": "calculation_snpeff_ann_explode", 6473 "function_params": [False, "fields", "snpeff_", "ANN"], 6474 }, 6475 "snpeff_ann_explode_uniquify": { 6476 "type": "python", 6477 "name": "snpeff_ann_explode_uniquify", 6478 "description": "Explode snpEff annotations", 6479 "available": True, 6480 "function_name": "calculation_snpeff_ann_explode", 6481 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6482 }, 6483 "snpeff_ann_explode_json": { 6484 "type": "python", 6485 "name": "snpeff_ann_explode_json", 6486 "description": "Explode snpEff annotations in JSON format", 6487 "available": True, 6488 "function_name": "calculation_snpeff_ann_explode", 6489 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6490 }, 6491 "NOMEN": { 6492 "type": "python", 6493 "name": "NOMEN", 6494 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6495 "available": True, 6496 "function_name": "calculation_extract_nomen", 6497 "function_params": [], 6498 }, 6499 "FINDBYPIPELINE": { 6500 "type": "python", 6501 "name": "FINDBYPIPELINE", 6502 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6503 "available": True, 6504 "function_name": "calculation_find_by_pipeline", 6505 "function_params": ["findbypipeline"], 6506 }, 6507 "FINDBYSAMPLE": { 6508 "type": "python", 6509 "name": "FINDBYSAMPLE", 6510 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6511 "available": True, 6512 "function_name": "calculation_find_by_pipeline", 6513 "function_params": ["findbysample"], 6514 }, 6515 "GENOTYPECONCORDANCE": { 6516 "type": "python", 6517 "name": "GENOTYPECONCORDANCE", 6518 "description": "Concordance of genotype for multi caller VCF", 6519 "available": True, 6520 "function_name": "calculation_genotype_concordance", 6521 "function_params": [], 6522 }, 6523 "BARCODE": { 6524 "type": "python", 6525 "name": "BARCODE", 6526 "description": "BARCODE as VaRank tool", 6527 "available": True, 6528 "function_name": "calculation_barcode", 6529 "function_params": [], 6530 }, 6531 "BARCODEFAMILY": { 6532 "type": "python", 6533 "name": "BARCODEFAMILY", 6534 "description": "BARCODEFAMILY as VaRank tool", 6535 "available": True, 6536 "function_name": "calculation_barcode_family", 6537 "function_params": ["BCF"], 6538 }, 6539 "TRIO": { 6540 "type": "python", 6541 "name": "TRIO", 6542 "description": "Inheritance for a trio family", 6543 "available": True, 6544 "function_name": "calculation_trio", 6545 "function_params": [], 6546 }, 6547 "VAF": { 6548 "type": "python", 6549 "name": "VAF", 6550 "description": "Variant Allele Frequency (VAF) harmonization", 6551 "available": True, 6552 "function_name": "calculation_vaf_normalization", 6553 "function_params": [], 6554 }, 6555 "VAF_stats": { 6556 "type": "python", 6557 "name": 
"VAF_stats", 6558 "description": "Variant Allele Frequency (VAF) statistics", 6559 "available": True, 6560 "function_name": "calculation_genotype_stats", 6561 "function_params": ["VAF"], 6562 }, 6563 "DP_stats": { 6564 "type": "python", 6565 "name": "DP_stats", 6566 "description": "Depth (DP) statistics", 6567 "available": True, 6568 "function_name": "calculation_genotype_stats", 6569 "function_params": ["DP"], 6570 }, 6571 "variant_id": { 6572 "type": "python", 6573 "name": "variant_id", 6574 "description": "Variant ID generated from variant position and type", 6575 "available": True, 6576 "function_name": "calculation_variant_id", 6577 "function_params": [], 6578 }, 6579 "transcripts_json": { 6580 "type": "python", 6581 "name": "transcripts_json", 6582 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6583 "available": True, 6584 "function_name": "calculation_transcripts_annotation", 6585 "function_params": ["transcripts_json", None], 6586 }, 6587 "transcripts_ann": { 6588 "type": "python", 6589 "name": "transcripts_ann", 6590 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6591 "available": True, 6592 "function_name": "calculation_transcripts_annotation", 6593 "function_params": [None, "transcripts_ann"], 6594 }, 6595 "transcripts_annotations": { 6596 "type": "python", 6597 "name": "transcripts_annotations", 6598 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6599 "available": True, 6600 "function_name": "calculation_transcripts_annotation", 6601 "function_params": [None, None], 6602 }, 6603 "transcripts_prioritization": { 6604 "type": "python", 6605 "name": "transcripts_prioritization", 6606 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6607 "available": True, 6608 "function_name": "calculation_transcripts_prioritization", 6609 "function_params": [], 6610 }, 6611 }, 6612 
"prioritizations": { 6613 "default": { 6614 "ANN2": [ 6615 { 6616 "type": "contains", 6617 "value": "HIGH", 6618 "score": 5, 6619 "flag": "PASS", 6620 "comment": [ 6621 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6622 ], 6623 }, 6624 { 6625 "type": "contains", 6626 "value": "MODERATE", 6627 "score": 3, 6628 "flag": "PASS", 6629 "comment": [ 6630 "A non-disruptive variant that might change protein effectiveness" 6631 ], 6632 }, 6633 { 6634 "type": "contains", 6635 "value": "LOW", 6636 "score": 0, 6637 "flag": "FILTERED", 6638 "comment": [ 6639 "Assumed to be mostly harmless or unlikely to change protein behavior" 6640 ], 6641 }, 6642 { 6643 "type": "contains", 6644 "value": "MODIFIER", 6645 "score": 0, 6646 "flag": "FILTERED", 6647 "comment": [ 6648 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6649 ], 6650 }, 6651 ], 6652 } 6653 }, 6654 } 6655 6656 return config_default.get(name, None) 6657 6658 def get_config_json( 6659 self, name: str, config_dict: dict = {}, config_file: str = None 6660 ) -> dict: 6661 """ 6662 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6663 default values, a dictionary, and a file. 6664 6665 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6666 the name of the configuration. It is used to identify and retrieve the configuration settings 6667 for a specific component or module 6668 :type name: str 6669 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6670 dictionary that allows you to provide additional configuration settings or overrides. 
When you 6671 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6672 the key is the configuration setting you want to override or 6673 :type config_dict: dict 6674 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6675 specify the path to a configuration file that contains additional settings. If provided, the 6676 function will read the contents of this file and update the configuration dictionary with the 6677 values found in the file, overriding any existing values with the 6678 :type config_file: str 6679 :return: The function `get_config_json` returns a dictionary containing the configuration 6680 settings. 6681 """ 6682 6683 # Create with default prioritizations 6684 config_default = self.get_config_default(name=name) 6685 configuration = config_default 6686 # log.debug(f"configuration={configuration}") 6687 6688 # Replace prioritizations from dict 6689 for config in config_dict: 6690 configuration[config] = config_dict[config] 6691 6692 # Replace prioritizations from file 6693 config_file = full_path(config_file) 6694 if config_file: 6695 if os.path.exists(config_file): 6696 with open(config_file) as config_file_content: 6697 config_file_dict = json.load(config_file_content) 6698 for config in config_file_dict: 6699 configuration[config] = config_file_dict[config] 6700 else: 6701 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6702 log.error(msg_error) 6703 raise ValueError(msg_error) 6704 6705 return configuration 6706 6707 def prioritization( 6708 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6709 ) -> bool: 6710 """ 6711 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6712 prioritizes variants based on configured profiles and criteria. 
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        Prioritize variants of a table according to configured profiles.

        For each requested profile, this method adds working PZ columns to the
        variants table, applies each criterion of the profile configuration as
        a SQL UPDATE (score accumulation, flag AND-ing, class/comment/infos
        concatenation), then folds the resulting PZ values back into the VCF
        INFO column and declares the matching INFO fields in the VCF header.
        Working columns are dropped afterwards.

        :param table: name of the table to prioritize; if None, the default
            variants table (clause "update") is used
        :type table: str
        :param pz_prefix: prefix of the generated INFO fields (PZScore, PZFlag,
            PZComment, PZInfos, PZClass, PZTags); defaults to param
            "pzprefix" or "PZ"
        :type pz_prefix: str
        :param pz_param: explicit prioritization parameters; if None, the
            "prioritization" section of the object's param is used
        :type pz_param: dict
        :return: True when prioritization ran, False when no profile is defined
        :raises ValueError: if a requested profile is not configured, if an
            annotation field is missing from the data, or if a criterion has
            neither 'operation' nor 'sql' mode
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param: explicit argument wins over object param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles (defaults merged with optional JSON file)
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options (comma-separated strings are accepted)
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles given through top-level param
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any profile missing from the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without a profile
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default one if none is set
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added during processing, dropped at the end
        added_columns = []

        # Create list of PZfields
        # List of PZFields: base names plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (only fields absent from
        # the input VCF header are (re)computed)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            # NOTE(review): explode_infos_prefix is only bound inside this
            # branch but is referenced later in the criterion loop — if
            # list_of_pzfields were empty while profiles run, that loop would
            # raise NameError; confirm this path cannot occur.
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata per PZ field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (unsuffixed fields document the
            # default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZ field, typed by field kind
            # (Score: INTEGER 0, Flag: BOOLEAN 1/PASS, Class: VARCHAR[] null,
            # anything else: empty STRING)
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

        # Profiles
        if profiles:

            # foreach profile in configuration file
            for profile in prioritizations_config:

                # If profile is asked in param, or ALL are asked (empty profile [])
                if profile in profiles or profiles == []:
                    log.info(f"Profile '{profile}'")

                    sql_set_info_option = ""

                    # SQL snippets that serialize each PZ column into an
                    # "NAME=value" INFO entry
                    sql_set_info = []

                    # PZ fields set

                    # PZScore (profile-suffixed, and unsuffixed for the
                    # default profile)
                    if (
                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score{pzfields_sep}{profile}=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Score" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )

                    # PZFlag (boolean column rendered as PASS/FILTERED)
                    if (
                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Flag" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )

                    # PZClass (list column joined with ',' or '.')
                    if (
                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class{pzfields_sep}{profile}=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )

                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Class" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )
                            """
                        )

                    # PZComment (skipped entirely when empty)
                    if (
                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Comment" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # PZInfos (skipped entirely when empty)
                    if (
                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Infos" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # Merge PZfields: chain the snippets with ';' separators
                    # into one concat argument list for the final INFO update
                    sql_set_info_option = ""
                    sql_set_sep = ""
                    for sql_set in sql_set_info:
                        if sql_set_sep:
                            sql_set_info_option += f"""
                                , concat('{sql_set_sep}', {sql_set})
                            """
                        else:
                            sql_set_info_option += f"""
                                , {sql_set}
                            """
                        sql_set_sep = ";"

                    sql_queries = []
                    for annotation in prioritizations_config[profile]:

                        # skip special sections
                        if annotation.startswith("_"):
                            continue

                        # For each criterions
                        for criterion in prioritizations_config[profile][
                            annotation
                        ]:

                            # Criterion mode: 'operation' (type/value compare)
                            # or 'sql' (free WHERE clause)
                            criterion_mode = None
                            if np.any(
                                np.isin(list(criterion.keys()), ["type", "value"])
                            ):
                                criterion_mode = "operation"
                            elif np.any(
                                np.isin(list(criterion.keys()), ["sql", "fields"])
                            ):
                                criterion_mode = "sql"
                            log.debug(f"Criterion Mode: {criterion_mode}")

                            # Criterion parameters (comments/infos are escaped
                            # for SQL quoting and INFO field separators)
                            criterion_type = criterion.get("type", None)
                            criterion_value = criterion.get("value", None)
                            criterion_sql = criterion.get("sql", None)
                            criterion_fields = criterion.get("fields", None)
                            criterion_score = criterion.get("score", 0)
                            criterion_flag = criterion.get("flag", "PASS")
                            criterion_class = criterion.get("class", None)
                            criterion_flag_bool = criterion_flag == "PASS"
                            criterion_comment = (
                                ", ".join(criterion.get("comment", []))
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )
                            criterion_infos = (
                                str(criterion)
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )

                            # SQL clause may be given as a list of fragments
                            if criterion_sql is not None and isinstance(
                                criterion_sql, list
                            ):
                                criterion_sql = " ".join(criterion_sql)

                            # Fields and explode (default: the annotation key)
                            if criterion_fields is None:
                                criterion_fields = [annotation]
                            if not isinstance(criterion_fields, list):
                                criterion_fields = str(criterion_fields).split(",")

                            # Class may be given as comma-separated string
                            if criterion_class is not None and not isinstance(
                                criterion_class, list
                            ):
                                criterion_class = str(criterion_class).split(",")

                            for annotation_field in criterion_fields:

                                # Explode specific annotation into a column
                                log.debug(
                                    f"Explode annotation '{annotation_field}'"
                                )
                                added_columns += self.explode_infos(
                                    prefix=explode_infos_prefix,
                                    fields=[annotation_field],
                                    table=table_variants,
                                )
                                extra_infos = self.get_extra_infos(
                                    table=table_variants
                                )

                                # Check if annotation field is present
                                if (
                                    f"{explode_infos_prefix}{annotation_field}"
                                    not in extra_infos
                                ):
                                    msq_err = f"Annotation '{annotation_field}' not in data"
                                    log.error(msq_err)
                                    raise ValueError(msq_err)
                                else:
                                    log.debug(
                                        f"Annotation '{annotation_field}' in data"
                                    )

                                # SET clauses for this criterion
                                sql_set = []
                                # NOTE(review): this rebinding shadows the
                                # profile-level sql_set_info, which was already
                                # merged into sql_set_info_option above, so it
                                # appears to be dead code here — confirm.
                                sql_set_info = []

                                # PZ fields set

                                # PZScore: accumulate (HOWARD) or keep max
                                # (VaRank)
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # if prioritization_score_mode == "HOWARD":
                                    #     sql_set.append(
                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                    #     )
                                    # VaRank prioritization score mode
                                    if prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    # default HOWARD prioritization score mode
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag: one FILTERED criterion filters the
                                # variant (boolean AND)
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass: merge distinct class labels
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment: append with ', ' separator
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )

                                # PZInfos: append raw criterion repr
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                # NOTE(review): criterion values and SQL come
                                # from the (trusted) prioritization config and
                                # are interpolated directly into SQL — confirm
                                # config files are always trusted input.
                                if sql_set_option:

                                    if criterion_mode in ["operation"]:

                                        # Numeric value -> typed comparison;
                                        # bare except routes any non-numeric
                                        # value to the string/regex branch
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                            """
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                            """
                                        sql_queries.append(sql_update)

                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE {criterion_sql}
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                    # PZTags: compact "field#value" summary of the PZ fields,
                    # appended to INFO once per profile
                    if (
                        f"{pz_prefix}Tags{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):

                        # Create PZFalgs value
                        pztags_value = ""
                        pztags_sep_default = ","
                        pztags_sep = ""
                        for pzfield in pzfields:
                            if pzfield not in [f"{pz_prefix}Tags"]:
                                if (
                                    f"{pzfield}{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if pzfield in [f"{pz_prefix}Flag"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                            END, '"""
                                    elif pzfield in [f"{pz_prefix}Class"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                            END, '"""
                                    else:
                                        pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                    pztags_sep = pztags_sep_default

                        # Add Query update for PZFlags
                        sql_update_pztags = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                INFO,
                                CASE WHEN INFO NOT in ('','.')
                                    THEN ';'
                                    ELSE ''
                                END,
                                '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                            )
                        """
                        sql_queries.append(sql_update_pztags)

                        # Add Query update for PZFlags for default
                        if profile == default_profile:
                            sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    ';',
                                    '{pz_prefix}Tags={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags_default)

                    log.info(f"""Profile '{profile}' - Prioritization... """)

                    if sql_queries:

                        # Run every criterion UPDATE for this profile
                        for sql_query in sql_queries:
                            log.debug(
                                f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                            )
                            self.conn.execute(sql_query)

                        # Fold the computed PZ columns back into INFO
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        self.conn.execute(sql_query_update)

        else:

            log.warning(f"No profiles in parameters")

        # Remove added columns (working PZ columns and exploded annotations)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
7475 """ 7476 return partition.apply(annotation_hgvs_partition, axis=1) 7477 7478 def annotation_hgvs_partition(row) -> str: 7479 """ 7480 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7481 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7482 7483 :param row: A dictionary-like object that contains the values for the following keys: 7484 :return: a string that contains the HGVS names associated with the given row of data. 7485 """ 7486 7487 chr = row["CHROM"] 7488 pos = row["POS"] 7489 ref = row["REF"] 7490 alt = row["ALT"] 7491 7492 # Find list of associated transcripts 7493 transcripts_list = list( 7494 polars_conn.execute( 7495 f""" 7496 SELECT transcript 7497 FROM refseq_df 7498 WHERE CHROM='{chr}' 7499 AND POS={pos} 7500 """ 7501 )["transcript"] 7502 ) 7503 7504 # Full HGVS annotation in list 7505 hgvs_full_list = [] 7506 7507 for transcript_name in transcripts_list: 7508 7509 # Transcript 7510 transcript = get_transcript( 7511 transcripts=transcripts, transcript_name=transcript_name 7512 ) 7513 # Exon 7514 if use_exon: 7515 exon = transcript.find_exon_number(pos) 7516 else: 7517 exon = None 7518 # Protein 7519 transcript_protein = None 7520 if use_protein or add_protein or full_format: 7521 transcripts_protein = list( 7522 polars_conn.execute( 7523 f""" 7524 SELECT protein 7525 FROM refseqlink_df 7526 WHERE transcript='{transcript_name}' 7527 LIMIT 1 7528 """ 7529 )["protein"] 7530 ) 7531 if len(transcripts_protein): 7532 transcript_protein = transcripts_protein[0] 7533 7534 # HGVS name 7535 hgvs_name = format_hgvs_name( 7536 chr, 7537 pos, 7538 ref, 7539 alt, 7540 genome=genome, 7541 transcript=transcript, 7542 transcript_protein=transcript_protein, 7543 exon=exon, 7544 use_gene=use_gene, 7545 use_protein=use_protein, 7546 full_format=full_format, 7547 use_version=use_version, 7548 codon_type=codon_type, 7549 ) 7550 hgvs_full_list.append(hgvs_name) 7551 if add_protein and not 
use_protein and not full_format: 7552 hgvs_name = format_hgvs_name( 7553 chr, 7554 pos, 7555 ref, 7556 alt, 7557 genome=genome, 7558 transcript=transcript, 7559 transcript_protein=transcript_protein, 7560 exon=exon, 7561 use_gene=use_gene, 7562 use_protein=True, 7563 full_format=False, 7564 use_version=use_version, 7565 codon_type=codon_type, 7566 ) 7567 hgvs_full_list.append(hgvs_name) 7568 7569 # Create liste of HGVS annotations 7570 hgvs_full = ",".join(hgvs_full_list) 7571 7572 return hgvs_full 7573 7574 # Polars connexion 7575 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7576 7577 # Config 7578 config = self.get_config() 7579 7580 # Databases 7581 # Genome 7582 databases_genomes_folders = ( 7583 config.get("folders", {}) 7584 .get("databases", {}) 7585 .get("genomes", DEFAULT_GENOME_FOLDER) 7586 ) 7587 databases_genome = ( 7588 config.get("folders", {}).get("databases", {}).get("genomes", "") 7589 ) 7590 # refseq database folder 7591 databases_refseq_folders = ( 7592 config.get("folders", {}) 7593 .get("databases", {}) 7594 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7595 ) 7596 # refseq 7597 databases_refseq = config.get("databases", {}).get("refSeq", None) 7598 # refSeqLink 7599 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7600 7601 # Param 7602 param = self.get_param() 7603 7604 # Quick HGVS 7605 if "hgvs_options" in param and param.get("hgvs_options", ""): 7606 log.info(f"Quick HGVS Annotation:") 7607 if not param.get("hgvs", None): 7608 param["hgvs"] = {} 7609 for option in param.get("hgvs_options", "").split(","): 7610 option_var_val = option.split("=") 7611 option_var = option_var_val[0] 7612 if len(option_var_val) > 1: 7613 option_val = option_var_val[1] 7614 else: 7615 option_val = "True" 7616 if option_val.upper() in ["TRUE"]: 7617 option_val = True 7618 elif option_val.upper() in ["FALSE"]: 7619 option_val = False 7620 log.info(f" {option_var}={option_val}") 7621 param["hgvs"][option_var] = option_val 7622 
7623 # Check if HGVS annotation enabled 7624 if "hgvs" in param: 7625 log.info(f"HGVS Annotation... ") 7626 for hgvs_option in param.get("hgvs", {}): 7627 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7628 else: 7629 return 7630 7631 # HGVS Param 7632 param_hgvs = param.get("hgvs", {}) 7633 use_exon = param_hgvs.get("use_exon", False) 7634 use_gene = param_hgvs.get("use_gene", False) 7635 use_protein = param_hgvs.get("use_protein", False) 7636 add_protein = param_hgvs.get("add_protein", False) 7637 full_format = param_hgvs.get("full_format", False) 7638 use_version = param_hgvs.get("use_version", False) 7639 codon_type = param_hgvs.get("codon_type", "3") 7640 7641 # refSseq refSeqLink 7642 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7643 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7644 7645 # Assembly 7646 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7647 7648 # Genome 7649 genome_file = None 7650 if find_genome(databases_genome): 7651 genome_file = find_genome(databases_genome) 7652 else: 7653 genome_file = find_genome( 7654 genome_path=databases_genomes_folders, assembly=assembly 7655 ) 7656 log.debug("Genome: " + str(genome_file)) 7657 7658 # refSseq 7659 refseq_file = find_file_prefix( 7660 input_file=databases_refseq, 7661 prefix="ncbiRefSeq", 7662 folder=databases_refseq_folders, 7663 assembly=assembly, 7664 ) 7665 log.debug("refSeq: " + str(refseq_file)) 7666 7667 # refSeqLink 7668 refseqlink_file = find_file_prefix( 7669 input_file=databases_refseqlink, 7670 prefix="ncbiRefSeqLink", 7671 folder=databases_refseq_folders, 7672 assembly=assembly, 7673 ) 7674 log.debug("refSeqLink: " + str(refseqlink_file)) 7675 7676 # Threads 7677 if not threads: 7678 threads = self.get_threads() 7679 log.debug("Threads: " + str(threads)) 7680 7681 # Variables 7682 table_variants = self.get_table_variants(clause="update") 7683 7684 # Get variants SNV and InDel only 7685 
query_variants = f""" 7686 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7687 FROM {table_variants} 7688 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7689 """ 7690 df_variants = self.get_query_to_df(query_variants) 7691 7692 # Added columns 7693 added_columns = [] 7694 7695 # Add hgvs column in variants table 7696 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7697 added_column = self.add_column( 7698 table_variants, hgvs_column_name, "STRING", default_value=None 7699 ) 7700 added_columns.append(added_column) 7701 7702 log.debug(f"refSeq loading...") 7703 # refSeq in duckDB 7704 refseq_table = get_refseq_table( 7705 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7706 ) 7707 # Loading all refSeq in Dataframe 7708 refseq_query = f""" 7709 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7710 FROM {refseq_table} 7711 JOIN df_variants ON ( 7712 {refseq_table}.chrom = df_variants.CHROM 7713 AND {refseq_table}.txStart<=df_variants.POS 7714 AND {refseq_table}.txEnd>=df_variants.POS 7715 ) 7716 """ 7717 refseq_df = self.conn.query(refseq_query).pl() 7718 7719 if refseqlink_file: 7720 log.debug(f"refSeqLink loading...") 7721 # refSeqLink in duckDB 7722 refseqlink_table = get_refseq_table( 7723 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7724 ) 7725 # Loading all refSeqLink in Dataframe 7726 protacc_column = "protAcc_with_ver" 7727 mrnaacc_column = "mrnaAcc_with_ver" 7728 refseqlink_query = f""" 7729 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7730 FROM {refseqlink_table} 7731 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7732 WHERE protAcc_without_ver IS NOT NULL 7733 """ 7734 # Polars Dataframe 7735 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7736 7737 # Read RefSeq transcripts into a python dict/model. 
7738 log.debug(f"Transcripts loading...") 7739 with tempfile.TemporaryDirectory() as tmpdir: 7740 transcripts_query = f""" 7741 COPY ( 7742 SELECT {refseq_table}.* 7743 FROM {refseq_table} 7744 JOIN df_variants ON ( 7745 {refseq_table}.chrom=df_variants.CHROM 7746 AND {refseq_table}.txStart<=df_variants.POS 7747 AND {refseq_table}.txEnd>=df_variants.POS 7748 ) 7749 ) 7750 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7751 """ 7752 self.conn.query(transcripts_query) 7753 with open(f"{tmpdir}/transcript.tsv") as infile: 7754 transcripts = read_transcripts(infile) 7755 7756 # Polars connexion 7757 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7758 7759 log.debug("Genome loading...") 7760 # Read genome sequence using pyfaidx. 7761 genome = Fasta(genome_file) 7762 7763 log.debug("Start annotation HGVS...") 7764 7765 # Create 7766 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7767 ddf = dd.from_pandas(df_variants, npartitions=threads) 7768 7769 # Use dask.dataframe.apply() to apply function on each partition 7770 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7771 7772 # Convert Dask DataFrame to Pandas Dataframe 7773 df = ddf.compute() 7774 7775 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        # Persist the annotated dataframe as Parquet, then merge it back into the
        # variants table (Parquet roundtrip works around a VARCHAR -> NULL cast issue)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (only rows with a non-empty, non-NULL HGVS value)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to the INFO column, prefixed with ';' when INFO
        # already holds annotations (i.e. is neither empty nor '.')
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Declare the new 'hgvs' INFO field in the VCF header
        # NOTE(review): "annotatation" typo in the user-visible description — confirm before changing
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove the temporary working columns added during annotation
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a sorted, human-readable list of the calculation operations that are
        flagged as available in the calculations configuration.

        :param operations_config_dict: optional calculations configuration as a dict
        :type operations_config_dict: dict
        :param operations_config_file: optional calculations configuration file path
        :type operations_config_file: str
        :return: list of help lines, starting with a header line
        """

        # Init
        operations_help = []

        # Load calculations configuration (dict and/or file)
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            # Only advertise operations flagged as available
            if op_available:
                operations_help.append(f" {op_name}: {op_description}")

        # Sort operations alphabetically
        operations_help.sort()

        # Insert header line
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
        """

        # Param
        param = self.get_param()

        # Load operations config (dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Normalize config keys to upper case
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add each entry of the comma-separated 'calculations'
        # shortcut parameter as an operation (empty options)
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
add_value_into_dict( 7920 dict_tree=param, 7921 sections=[ 7922 "calculation", 7923 "calculations", 7924 calculation_operation.upper(), 7925 ], 7926 value={}, 7927 ) 7928 7929 # Operations for calculation 7930 if not operations: 7931 operations = param.get("calculation", {}).get("calculations", {}) 7932 7933 if operations: 7934 log.info(f"Calculations...") 7935 7936 # For each operations 7937 for operation_name in operations: 7938 operation_name = operation_name.upper() 7939 if operation_name not in [""]: 7940 if operation_name in operations_config: 7941 log.info(f"Calculation '{operation_name}'") 7942 operation = operations_config[operation_name] 7943 operation_type = operation.get("type", "sql") 7944 if operation_type == "python": 7945 self.calculation_process_function( 7946 operation=operation, operation_name=operation_name 7947 ) 7948 elif operation_type == "sql": 7949 self.calculation_process_sql( 7950 operation=operation, operation_name=operation_name 7951 ) 7952 else: 7953 log.error( 7954 f"Operations config: Type '{operation_type}' NOT available" 7955 ) 7956 raise ValueError( 7957 f"Operations config: Type '{operation_type}' NOT available" 7958 ) 7959 else: 7960 log.error( 7961 f"Operations config: Calculation '{operation_name}' NOT available" 7962 ) 7963 raise ValueError( 7964 f"Operations config: Calculation '{operation_name}' NOT available" 7965 ) 7966 7967 # Explode INFOS fields into table fields 7968 if self.get_explode_infos(): 7969 self.explode_infos( 7970 prefix=self.get_explode_infos_prefix(), 7971 fields=self.get_explode_infos_fields(), 7972 force=True, 7973 ) 7974 7975 def calculation_process_sql( 7976 self, operation: dict, operation_name: str = "unknown" 7977 ) -> None: 7978 """ 7979 The `calculation_process_sql` function takes in a mathematical operation as a string and 7980 performs the operation, updating the specified table with the result. 
7981 7982 :param operation: The `operation` parameter is a dictionary that contains information about the 7983 mathematical operation to be performed. It includes the following keys: 7984 :type operation: dict 7985 :param operation_name: The `operation_name` parameter is a string that represents the name of 7986 the mathematical operation being performed. It is used for logging and error handling purposes, 7987 defaults to unknown 7988 :type operation_name: str (optional) 7989 """ 7990 7991 # table variants 7992 table_variants = self.get_table_variants(clause="alter") 7993 7994 # Operation infos 7995 operation_name = operation.get("name", "unknown") 7996 log.debug(f"process sql {operation_name}") 7997 output_column_name = operation.get("output_column_name", operation_name) 7998 output_column_type = operation.get("output_column_type", "String") 7999 prefix = operation.get("explode_infos_prefix", "") 8000 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8001 output_column_description = operation.get( 8002 "output_column_description", f"{operation_name} operation" 8003 ) 8004 operation_query = operation.get("operation_query", None) 8005 if isinstance(operation_query, list): 8006 operation_query = " ".join(operation_query) 8007 operation_info_fields = operation.get("info_fields", []) 8008 operation_info_fields_check = operation.get("info_fields_check", False) 8009 operation_info = operation.get("operation_info", True) 8010 8011 if operation_query: 8012 8013 # Info fields check 8014 operation_info_fields_check_result = True 8015 if operation_info_fields_check: 8016 header_infos = self.get_header().infos 8017 for info_field in operation_info_fields: 8018 operation_info_fields_check_result = ( 8019 operation_info_fields_check_result 8020 and info_field in header_infos 8021 ) 8022 8023 # If info fields available 8024 if operation_info_fields_check_result: 8025 8026 # Added_columns 8027 added_columns = [] 8028 8029 # Create VCF header field 
8030 vcf_reader = self.get_header() 8031 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8032 output_column_name, 8033 ".", 8034 output_column_type, 8035 output_column_description, 8036 "howard calculation", 8037 "0", 8038 self.code_type_map.get(output_column_type), 8039 ) 8040 8041 # Explode infos if needed 8042 log.debug(f"calculation_process_sql prefix {prefix}") 8043 added_columns += self.explode_infos( 8044 prefix=prefix, 8045 fields=[output_column_name] + operation_info_fields, 8046 force=True, 8047 ) 8048 8049 # Create column 8050 added_column = self.add_column( 8051 table_name=table_variants, 8052 column_name=prefix + output_column_name, 8053 column_type=output_column_type_sql, 8054 default_value="null", 8055 ) 8056 added_columns.append(added_column) 8057 8058 # Operation calculation 8059 try: 8060 8061 # Query to update calculation column 8062 sql_update = f""" 8063 UPDATE {table_variants} 8064 SET "{prefix}{output_column_name}" = ({operation_query}) 8065 """ 8066 self.conn.execute(sql_update) 8067 8068 # Add to INFO 8069 if operation_info: 8070 sql_update_info = f""" 8071 UPDATE {table_variants} 8072 SET "INFO" = 8073 concat( 8074 CASE 8075 WHEN "INFO" IS NOT NULL 8076 THEN concat("INFO", ';') 8077 ELSE '' 8078 END, 8079 '{output_column_name}=', 8080 "{prefix}{output_column_name}" 8081 ) 8082 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8083 """ 8084 self.conn.execute(sql_update_info) 8085 8086 except: 8087 log.error( 8088 f"Operations config: Calculation '{operation_name}' query failed" 8089 ) 8090 raise ValueError( 8091 f"Operations config: Calculation '{operation_name}' query failed" 8092 ) 8093 8094 # Remove added columns 8095 for added_column in added_columns: 8096 log.debug(f"added_column: {added_column}") 8097 self.drop_column(column=added_column) 8098 8099 else: 8100 log.error( 8101 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8102 ) 8103 raise ValueError( 8104 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8105 ) 8106 8107 else: 8108 log.error( 8109 f"Operations config: Calculation '{operation_name}' query NOT defined" 8110 ) 8111 raise ValueError( 8112 f"Operations config: Calculation '{operation_name}' query NOT defined" 8113 ) 8114 8115 def calculation_process_function( 8116 self, operation: dict, operation_name: str = "unknown" 8117 ) -> None: 8118 """ 8119 The `calculation_process_function` takes in an operation dictionary and performs the specified 8120 function with the given parameters. 8121 8122 :param operation: The `operation` parameter is a dictionary that contains information about the 8123 operation to be performed. It has the following keys: 8124 :type operation: dict 8125 :param operation_name: The `operation_name` parameter is a string that represents the name of 8126 the operation being performed. It is used for logging purposes, defaults to unknown 8127 :type operation_name: str (optional) 8128 """ 8129 8130 operation_name = operation["name"] 8131 log.debug(f"process sql {operation_name}") 8132 function_name = operation["function_name"] 8133 function_params = operation["function_params"] 8134 getattr(self, function_name)(*function_params) 8135 8136 def calculation_variant_id(self) -> None: 8137 """ 8138 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8139 updates the INFO field of a variants table with the variant ID. 
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        # The variant_id column is temporary and dropped at the end
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id INFO field to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<variant_id_tag>=<value>' to INFO (';'-separated when INFO is not empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" = 
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
            """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: Name of the INFO field that will store the HGVS nomenclatures
            extracted from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Name of the INFO field that contains the SnpEff annotations to
            extract HGVS nomenclatures from, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Explode snpEff field into a column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: annotation names are single-quoted in the INFO
            # description and separated by ' | '
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the dict key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO with extracted HGVS values (skip empty/'.'/'NaN')
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
        boolean flag that determines whether the output should be uniquified or not.
        When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations.
        This field will be processed to explode the HGVS annotations and update the variant
        information accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by "INFO/" — confirm this is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Explode snpEff field into a column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: annotation names are single-quoted in the INFO
            # description and separated by ' | '
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one JSON field, or one field per exploded annotation
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update INFO with the exploded annotations (skip empty/'.'/'NaN')
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field from the NOMEN calculation options
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts of preference (first column of the transcripts file)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field
                # into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO with all non-empty NOMEN sub-fields
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
                """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when the VCF has a FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column, computed per row from the samples
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO with the computed value (skip empty/'.')
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # Only applicable when the VCF has a FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT
{samples_fields} FROM {table_variants} """ 8793 ) 8794 8795 # Create genotypeconcordance column 8796 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8797 dataframe_genotypeconcordance.apply( 8798 lambda row: genotypeconcordance( 8799 row, samples=self.get_header_sample_list() 8800 ), 8801 axis=1, 8802 ) 8803 ) 8804 8805 # Add genotypeconcordance to header 8806 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8807 genotypeconcordance_tag, 8808 ".", 8809 "String", 8810 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8811 "howard calculation", 8812 "0", 8813 self.code_type_map.get("String"), 8814 ) 8815 8816 # Update 8817 sql_update = f""" 8818 UPDATE variants 8819 SET "INFO" = 8820 concat( 8821 CASE 8822 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8823 THEN '' 8824 ELSE concat("INFO", ';') 8825 END, 8826 CASE 8827 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8828 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8829 THEN concat( 8830 '{genotypeconcordance_tag}=', 8831 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8832 ) 8833 ELSE '' 8834 END 8835 ) 8836 FROM dataframe_genotypeconcordance 8837 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8838 """ 8839 self.conn.execute(sql_update) 8840 8841 # Remove added columns 8842 for added_column in added_columns: 8843 self.drop_column(column=added_column) 8844 8845 # Delete dataframe 8846 del dataframe_genotypeconcordance 8847 gc.collect() 8848 8849 def calculation_barcode(self, tag: str = "barcode") -> None: 8850 """ 8851 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8852 updates the INFO field in the file with the calculated barcode values. 8853 8854 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8855 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8856 the default tag name is set to "barcode", defaults to barcode 8857 :type tag: str (optional) 8858 """ 8859 8860 # if FORMAT and samples 8861 if ( 8862 "FORMAT" in self.get_header_columns_as_list() 8863 and self.get_header_sample_list() 8864 ): 8865 8866 # barcode annotation field 8867 if not tag: 8868 tag = "barcode" 8869 8870 # VCF infos tags 8871 vcf_infos_tags = { 8872 tag: "barcode calculation (VaRank)", 8873 } 8874 8875 # Prefix 8876 prefix = self.get_explode_infos_prefix() 8877 8878 # Field 8879 barcode_infos = prefix + tag 8880 8881 # Variants table 8882 table_variants = self.get_table_variants() 8883 8884 # Header 8885 vcf_reader = self.get_header() 8886 8887 # Create variant id 8888 variant_id_column = self.get_variant_id_column() 8889 added_columns = [variant_id_column] 8890 8891 # variant_id, FORMAT and samples 8892 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8893 self.get_header_sample_list() 8894 ) 8895 8896 # Create dataframe 8897 dataframe_barcode = self.get_query_to_df( 8898 f""" SELECT {samples_fields} FROM {table_variants} """ 8899 ) 8900 8901 # Create barcode column 8902 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8903 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8904 ) 8905 8906 # Add barcode to header 8907 vcf_reader.infos[tag] = vcf.parser._Info( 8908 tag, 8909 ".", 8910 "String", 8911 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8912 "howard calculation", 8913 "0", 8914 self.code_type_map.get("String"), 8915 ) 8916 8917 # Update 8918 sql_update = f""" 8919 UPDATE {table_variants} 8920 SET "INFO" = 8921 concat( 8922 CASE 8923 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8924 THEN '' 8925 ELSE concat("INFO", ';') 8926 END, 8927 CASE 8928 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8929 AND dataframe_barcode."{barcode_infos}" NOT NULL 8930 THEN concat( 8931 '{tag}=', 8932 dataframe_barcode."{barcode_infos}" 8933 ) 8934 ELSE '' 8935 
END 8936 ) 8937 FROM dataframe_barcode 8938 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8939 """ 8940 self.conn.execute(sql_update) 8941 8942 # Remove added columns 8943 for added_column in added_columns: 8944 self.drop_column(column=added_column) 8945 8946 # Delete dataframe 8947 del dataframe_barcode 8948 gc.collect() 8949 8950 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8951 """ 8952 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8953 and updates the INFO field in the file with the calculated barcode values. 8954 8955 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8956 the barcode tag that will be added to the VCF file during the calculation process. If no value 8957 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8958 :type tag: str (optional) 8959 """ 8960 8961 # if FORMAT and samples 8962 if ( 8963 "FORMAT" in self.get_header_columns_as_list() 8964 and self.get_header_sample_list() 8965 ): 8966 8967 # barcode annotation field 8968 if not tag: 8969 tag = "BCF" 8970 8971 # VCF infos tags 8972 vcf_infos_tags = { 8973 tag: "barcode family calculation", 8974 f"{tag}S": "barcode family samples", 8975 } 8976 8977 # Param 8978 param = self.get_param() 8979 log.debug(f"param={param}") 8980 8981 # Prefix 8982 prefix = self.get_explode_infos_prefix() 8983 8984 # PED param 8985 ped = ( 8986 param.get("calculation", {}) 8987 .get("calculations", {}) 8988 .get("BARCODEFAMILY", {}) 8989 .get("family_pedigree", None) 8990 ) 8991 log.debug(f"ped={ped}") 8992 8993 # Load PED 8994 if ped: 8995 8996 # Pedigree is a file 8997 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8998 log.debug("Pedigree is file") 8999 with open(full_path(ped)) as ped: 9000 ped = json.load(ped) 9001 9002 # Pedigree is a string 9003 elif isinstance(ped, str): 9004 log.debug("Pedigree is str") 9005 
try: 9006 ped = json.loads(ped) 9007 log.debug("Pedigree is json str") 9008 except ValueError as e: 9009 ped_samples = ped.split(",") 9010 ped = {} 9011 for ped_sample in ped_samples: 9012 ped[ped_sample] = ped_sample 9013 9014 # Pedigree is a dict 9015 elif isinstance(ped, dict): 9016 log.debug("Pedigree is dict") 9017 9018 # Pedigree is not well formatted 9019 else: 9020 msg_error = "Pedigree not well formatted" 9021 log.error(msg_error) 9022 raise ValueError(msg_error) 9023 9024 # Construct list 9025 ped_samples = list(ped.values()) 9026 9027 else: 9028 log.debug("Pedigree not defined. Take all samples") 9029 ped_samples = self.get_header_sample_list() 9030 ped = {} 9031 for ped_sample in ped_samples: 9032 ped[ped_sample] = ped_sample 9033 9034 # Check pedigree 9035 if not ped or len(ped) == 0: 9036 msg_error = f"Error in pedigree: samples {ped_samples}" 9037 log.error(msg_error) 9038 raise ValueError(msg_error) 9039 9040 # Log 9041 log.info( 9042 "Calculation 'BARCODEFAMILY' - Samples: " 9043 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9044 ) 9045 log.debug(f"ped_samples={ped_samples}") 9046 9047 # Field 9048 barcode_infos = prefix + tag 9049 9050 # Variants table 9051 table_variants = self.get_table_variants() 9052 9053 # Header 9054 vcf_reader = self.get_header() 9055 9056 # Create variant id 9057 variant_id_column = self.get_variant_id_column() 9058 added_columns = [variant_id_column] 9059 9060 # variant_id, FORMAT and samples 9061 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9062 ped_samples 9063 ) 9064 9065 # Create dataframe 9066 dataframe_barcode = self.get_query_to_df( 9067 f""" SELECT {samples_fields} FROM {table_variants} """ 9068 ) 9069 9070 # Create barcode column 9071 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9072 lambda row: barcode(row, samples=ped_samples), axis=1 9073 ) 9074 9075 # Add barcode family to header 9076 # Add vaf_normalization to header 9077 vcf_reader.formats[tag] = 
vcf.parser._Format( 9078 id=tag, 9079 num=".", 9080 type="String", 9081 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9082 type_code=self.code_type_map.get("String"), 9083 ) 9084 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9085 id=f"{tag}S", 9086 num=".", 9087 type="String", 9088 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9089 type_code=self.code_type_map.get("String"), 9090 ) 9091 9092 # Update 9093 # for sample in ped_samples: 9094 sql_update_set = [] 9095 for sample in self.get_header_sample_list() + ["FORMAT"]: 9096 if sample in ped_samples: 9097 value = f'dataframe_barcode."{barcode_infos}"' 9098 value_samples = "'" + ",".join(ped_samples) + "'" 9099 elif sample == "FORMAT": 9100 value = f"'{tag}'" 9101 value_samples = f"'{tag}S'" 9102 else: 9103 value = "'.'" 9104 value_samples = "'.'" 9105 format_regex = r"[a-zA-Z0-9\s]" 9106 sql_update_set.append( 9107 f""" 9108 "{sample}" = 9109 concat( 9110 CASE 9111 WHEN {table_variants}."{sample}" = './.' 9112 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9113 ELSE {table_variants}."{sample}" 9114 END, 9115 ':', 9116 {value}, 9117 ':', 9118 {value_samples} 9119 ) 9120 """ 9121 ) 9122 9123 sql_update_set_join = ", ".join(sql_update_set) 9124 sql_update = f""" 9125 UPDATE {table_variants} 9126 SET {sql_update_set_join} 9127 FROM dataframe_barcode 9128 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9129 """ 9130 self.conn.execute(sql_update) 9131 9132 # Remove added columns 9133 for added_column in added_columns: 9134 self.drop_column(column=added_column) 9135 9136 # Delete dataframe 9137 del dataframe_barcode 9138 gc.collect() 9139 9140 def calculation_trio(self) -> None: 9141 """ 9142 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9143 information to the INFO field of each variant. 
9144 """ 9145 9146 # if FORMAT and samples 9147 if ( 9148 "FORMAT" in self.get_header_columns_as_list() 9149 and self.get_header_sample_list() 9150 ): 9151 9152 # trio annotation field 9153 trio_tag = "trio" 9154 9155 # VCF infos tags 9156 vcf_infos_tags = { 9157 "trio": "trio calculation", 9158 } 9159 9160 # Param 9161 param = self.get_param() 9162 9163 # Prefix 9164 prefix = self.get_explode_infos_prefix() 9165 9166 # Trio param 9167 trio_ped = ( 9168 param.get("calculation", {}) 9169 .get("calculations", {}) 9170 .get("TRIO", {}) 9171 .get("trio_pedigree", None) 9172 ) 9173 9174 # Load trio 9175 if trio_ped: 9176 9177 # Trio pedigree is a file 9178 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9179 log.debug("TRIO pedigree is file") 9180 with open(full_path(trio_ped)) as trio_ped: 9181 trio_ped = json.load(trio_ped) 9182 9183 # Trio pedigree is a string 9184 elif isinstance(trio_ped, str): 9185 log.debug("TRIO pedigree is str") 9186 try: 9187 trio_ped = json.loads(trio_ped) 9188 log.debug("TRIO pedigree is json str") 9189 except ValueError as e: 9190 trio_samples = trio_ped.split(",") 9191 if len(trio_samples) == 3: 9192 trio_ped = { 9193 "father": trio_samples[0], 9194 "mother": trio_samples[1], 9195 "child": trio_samples[2], 9196 } 9197 log.debug("TRIO pedigree is list str") 9198 else: 9199 msg_error = "TRIO pedigree not well formatted" 9200 log.error(msg_error) 9201 raise ValueError(msg_error) 9202 9203 # Trio pedigree is a dict 9204 elif isinstance(trio_ped, dict): 9205 log.debug("TRIO pedigree is dict") 9206 9207 # Trio pedigree is not well formatted 9208 else: 9209 msg_error = "TRIO pedigree not well formatted" 9210 log.error(msg_error) 9211 raise ValueError(msg_error) 9212 9213 # Construct trio list 9214 trio_samples = [ 9215 trio_ped.get("father", ""), 9216 trio_ped.get("mother", ""), 9217 trio_ped.get("child", ""), 9218 ] 9219 9220 else: 9221 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9222 samples_list = self.get_header_sample_list() 9223 if len(samples_list) >= 3: 9224 trio_samples = self.get_header_sample_list()[0:3] 9225 trio_ped = { 9226 "father": trio_samples[0], 9227 "mother": trio_samples[1], 9228 "child": trio_samples[2], 9229 } 9230 else: 9231 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9232 log.error(msg_error) 9233 raise ValueError(msg_error) 9234 9235 # Check trio pedigree 9236 if not trio_ped or len(trio_ped) != 3: 9237 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9238 log.error(msg_error) 9239 raise ValueError(msg_error) 9240 9241 # Log 9242 log.info( 9243 f"Calculation 'TRIO' - Samples: " 9244 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9245 ) 9246 9247 # Field 9248 trio_infos = prefix + trio_tag 9249 9250 # Variants table 9251 table_variants = self.get_table_variants() 9252 9253 # Header 9254 vcf_reader = self.get_header() 9255 9256 # Create variant id 9257 variant_id_column = self.get_variant_id_column() 9258 added_columns = [variant_id_column] 9259 9260 # variant_id, FORMAT and samples 9261 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9262 self.get_header_sample_list() 9263 ) 9264 9265 # Create dataframe 9266 dataframe_trio = self.get_query_to_df( 9267 f""" SELECT {samples_fields} FROM {table_variants} """ 9268 ) 9269 9270 # Create trio column 9271 dataframe_trio[trio_infos] = dataframe_trio.apply( 9272 lambda row: trio(row, samples=trio_samples), axis=1 9273 ) 9274 9275 # Add trio to header 9276 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9277 trio_tag, 9278 ".", 9279 "String", 9280 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9281 "howard calculation", 9282 "0", 9283 self.code_type_map.get("String"), 9284 ) 9285 9286 # Update 9287 sql_update = f""" 9288 UPDATE {table_variants} 9289 SET "INFO" = 9290 concat( 9291 CASE 9292 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9293 THEN '' 9294 ELSE 
concat("INFO", ';') 9295 END, 9296 CASE 9297 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9298 AND dataframe_trio."{trio_infos}" NOT NULL 9299 THEN concat( 9300 '{trio_tag}=', 9301 dataframe_trio."{trio_infos}" 9302 ) 9303 ELSE '' 9304 END 9305 ) 9306 FROM dataframe_trio 9307 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9308 """ 9309 self.conn.execute(sql_update) 9310 9311 # Remove added columns 9312 for added_column in added_columns: 9313 self.drop_column(column=added_column) 9314 9315 # Delete dataframe 9316 del dataframe_trio 9317 gc.collect() 9318 9319 def calculation_vaf_normalization(self) -> None: 9320 """ 9321 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9322 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9323 :return: The function does not return anything. 9324 """ 9325 9326 # if FORMAT and samples 9327 if ( 9328 "FORMAT" in self.get_header_columns_as_list() 9329 and self.get_header_sample_list() 9330 ): 9331 9332 # vaf_normalization annotation field 9333 vaf_normalization_tag = "VAF" 9334 9335 # VCF infos tags 9336 vcf_infos_tags = { 9337 "VAF": "VAF Variant Frequency", 9338 } 9339 9340 # Prefix 9341 prefix = self.get_explode_infos_prefix() 9342 9343 # Variants table 9344 table_variants = self.get_table_variants() 9345 9346 # Header 9347 vcf_reader = self.get_header() 9348 9349 # Do not calculate if VAF already exists 9350 if "VAF" in vcf_reader.formats: 9351 log.debug("VAF already on genotypes") 9352 return 9353 9354 # Create variant id 9355 variant_id_column = self.get_variant_id_column() 9356 added_columns = [variant_id_column] 9357 9358 # variant_id, FORMAT and samples 9359 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9360 f""" "{sample}" """ for sample in self.get_header_sample_list() 9361 ) 9362 9363 # Create dataframe 9364 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9365 log.debug(f"query={query}") 9366 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9367 9368 vaf_normalization_set = [] 9369 9370 # for each sample vaf_normalization 9371 for sample in self.get_header_sample_list(): 9372 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9373 lambda row: vaf_normalization(row, sample=sample), axis=1 9374 ) 9375 vaf_normalization_set.append( 9376 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9377 ) 9378 9379 # Add VAF to FORMAT 9380 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9381 "FORMAT" 9382 ].apply(lambda x: str(x) + ":VAF") 9383 vaf_normalization_set.append( 9384 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9385 ) 9386 9387 # Add vaf_normalization to header 9388 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9389 id=vaf_normalization_tag, 9390 num="1", 9391 type="Float", 9392 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9393 type_code=self.code_type_map.get("Float"), 9394 ) 9395 9396 # Create fields to add in INFO 9397 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9398 9399 # Update 9400 sql_update = f""" 9401 UPDATE {table_variants} 9402 SET {sql_vaf_normalization_set} 9403 FROM dataframe_vaf_normalization 9404 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9405 9406 """ 9407 self.conn.execute(sql_update) 9408 9409 # Remove added columns 9410 for added_column in added_columns: 9411 self.drop_column(column=added_column) 9412 9413 # Delete dataframe 9414 del dataframe_vaf_normalization 9415 gc.collect() 9416 9417 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9418 """ 9419 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9420 field in a VCF file and updates the INFO column of the variants table with the calculated 9421 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Process only if genotypes are available (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one INFO tag per computed statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Exploded column holding the computed stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added now, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotype columns only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (genotype_stats returns a dict of stats)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL fragments, one per stat tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this stat from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # The first appended field carries no ';' separator (the outer
                # concat already appends one after INFO); following fields do.
                # NOTE(review): `sep` depends on build-time position, not on
                # runtime NULL-ness — if the first stat is NULL and a later one
                # is not, INFO may end up with a double ';' — confirm whether
                # downstream parsing tolerates empty INFO entries.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append all computed stats to INFO, joined on variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information. If no value is provided
        when calling the method, it defaults to None
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table (None when no transcripts are configured)
        transcripts_table = self.create_transcript_view()

        # Add info field to the variants table from the transcripts view
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table (None when no transcripts are configured)
        transcripts_table = self.create_transcript_view()

        # Prioritize transcripts and report the result into the variants table
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")
Check param.json file configuration") 9600 9601 ############### 9602 # Transcripts # 9603 ############### 9604 9605 def transcripts_prioritization( 9606 self, transcripts_table: str = None, param: dict = {} 9607 ) -> bool: 9608 """ 9609 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9610 and updates the variants table with the prioritized information. 9611 9612 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9613 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9614 This parameter is used to identify the table where the transcripts data is stored for the 9615 prioritization process 9616 :type transcripts_table: str 9617 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9618 that contains various configuration settings for the prioritization process of transcripts. It 9619 is used to customize the behavior of the prioritization algorithm and includes settings such as 9620 the prefix for prioritization fields, default profiles, and other 9621 :type param: dict 9622 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9623 transcripts prioritization process is successfully completed, and `False` if there are any 9624 issues or if no profile is defined for transcripts prioritization. 
9625 """ 9626 9627 log.debug("Start transcripts prioritization...") 9628 9629 # Param 9630 if not param: 9631 param = self.get_param() 9632 9633 # Variants table 9634 table_variants = self.get_table_variants() 9635 log.debug(f"transcripts_table={transcripts_table}") 9636 # Transcripts table 9637 if transcripts_table is None: 9638 log.debug(f"transcripts_table={transcripts_table}") 9639 transcripts_table = self.create_transcript_view( 9640 transcripts_table="transcripts", param=param 9641 ) 9642 log.debug(f"transcripts_table={transcripts_table}") 9643 if transcripts_table is None: 9644 msg_err = "No Transcripts table availalble" 9645 log.error(msg_err) 9646 raise ValueError(msg_err) 9647 9648 # Get transcripts columns 9649 columns_as_list_query = f""" 9650 DESCRIBE {transcripts_table} 9651 """ 9652 columns_as_list = list( 9653 self.get_query_to_df(columns_as_list_query)["column_name"] 9654 ) 9655 9656 # Create INFO if not exists 9657 if "INFO" not in columns_as_list: 9658 query_add_info = f""" 9659 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9660 """ 9661 self.execute_query(query_add_info) 9662 9663 # Prioritization param and Force only PZ Score and Flag 9664 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9665 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9666 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9667 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9668 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9669 pz_profile_default = ( 9670 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9671 ) 9672 9673 # Exit if no profile 9674 if pz_profile_default is None: 9675 log.warning("No profile defined for transcripts prioritization") 9676 return False 9677 9678 # Prioritization 9679 prioritization_result = self.prioritization( 9680 table=transcripts_table, 9681 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9682 ) 9683 if not 
prioritization_result: 9684 log.warning("Transcripts prioritization not processed") 9685 return False 9686 9687 # Explode PZ fields 9688 self.explode_infos( 9689 table=transcripts_table, 9690 fields=param.get("transcripts", {}) 9691 .get("prioritization", {}) 9692 .get("pzfields", []), 9693 ) 9694 9695 # Export Transcripts prioritization infos to variants table 9696 query_update = f""" 9697 WITH RankedTranscripts AS ( 9698 SELECT 9699 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9700 ROW_NUMBER() OVER ( 9701 PARTITION BY "#CHROM", POS, REF, ALT 9702 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9703 ) AS rn 9704 FROM 9705 {transcripts_table} 9706 ) 9707 UPDATE {table_variants} 9708 SET 9709 INFO = CONCAT(CASE 9710 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9711 THEN '' 9712 ELSE concat("INFO", ';') 9713 END, 9714 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9715 ) 9716 FROM 9717 RankedTranscripts 9718 WHERE 9719 rn = 1 9720 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9721 AND variants."POS" = RankedTranscripts."POS" 9722 AND variants."REF" = RankedTranscripts."REF" 9723 AND variants."ALT" = RankedTranscripts."ALT" 9724 9725 """ 9726 self.execute_query(query=query_update) 9727 9728 # Add PZ Transcript in header 9729 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9730 pz_fields_transcripts, 9731 ".", 9732 "String", 9733 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9734 "unknown", 9735 "unknown", 9736 code_type_map["String"], 9737 ) 9738 9739 # Return 9740 return True 9741 9742 def create_transcript_view_from_columns_map( 9743 self, 9744 transcripts_table: str = "transcripts", 9745 columns_maps: dict = {}, 9746 added_columns: list = [], 9747 temporary_tables: list = None, 9748 annotation_fields: list = None, 9749 ) -> tuple[list, list, list]: 9750 """ 9751 The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on 9752 specified columns mapping for transcripts data. 9753 9754 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9755 the table where the transcripts data is stored or will be stored in the database. This table 9756 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9757 predictions, etc. It defaults to "transcripts, defaults to transcripts 9758 :type transcripts_table: str (optional) 9759 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9760 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9761 represents a mapping configuration for a specific set of columns. It typically includes details such 9762 as the main transcript column and additional information columns 9763 :type columns_maps: dict 9764 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9765 function is a list that stores the additional columns that will be added to the view being created 9766 based on the columns map provided. These columns are generated by exploding the transcript 9767 information columns along with the main transcript column 9768 :type added_columns: list 9769 :param temporary_tables: The `temporary_tables` parameter in the 9770 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9771 tables created during the process of creating a transcript view from a columns map. 
These temporary 9772 tables are used to store intermediate results or transformations before the final view is generated 9773 :type temporary_tables: list 9774 :param annotation_fields: The `annotation_fields` parameter in the 9775 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9776 for annotation in the query view creation process. These fields are extracted from the 9777 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9778 :type annotation_fields: list 9779 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9780 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9781 """ 9782 9783 log.debug("Start transcrpts view creation from columns map...") 9784 9785 # "from_columns_map": [ 9786 # { 9787 # "transcripts_column": "Ensembl_transcriptid", 9788 # "transcripts_infos_columns": [ 9789 # "genename", 9790 # "Ensembl_geneid", 9791 # "LIST_S2_score", 9792 # "LIST_S2_pred", 9793 # ], 9794 # }, 9795 # { 9796 # "transcripts_column": "Ensembl_transcriptid", 9797 # "transcripts_infos_columns": [ 9798 # "genename", 9799 # "VARITY_R_score", 9800 # "Aloft_pred", 9801 # ], 9802 # }, 9803 # ], 9804 9805 # Init 9806 if temporary_tables is None: 9807 temporary_tables = [] 9808 if annotation_fields is None: 9809 annotation_fields = [] 9810 9811 # Variants table 9812 table_variants = self.get_table_variants() 9813 9814 for columns_map in columns_maps: 9815 9816 # Transcript column 9817 transcripts_column = columns_map.get("transcripts_column", None) 9818 9819 # Transcripts infos columns 9820 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9821 9822 if transcripts_column is not None: 9823 9824 # Explode 9825 added_columns += self.explode_infos( 9826 fields=[transcripts_column] + transcripts_infos_columns 9827 ) 9828 9829 # View clauses 9830 clause_select = [] 9831 for field in [transcripts_column] + 
transcripts_infos_columns: 9832 clause_select.append( 9833 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9834 ) 9835 if field not in [transcripts_column]: 9836 annotation_fields.append(field) 9837 9838 # Querey View 9839 query = f""" 9840 SELECT 9841 "#CHROM", POS, REF, ALT, INFO, 9842 "{transcripts_column}" AS 'transcript', 9843 {", ".join(clause_select)} 9844 FROM ( 9845 SELECT 9846 "#CHROM", POS, REF, ALT, INFO, 9847 {", ".join(clause_select)} 9848 FROM {table_variants} 9849 ) 9850 WHERE "{transcripts_column}" IS NOT NULL 9851 """ 9852 9853 # Create temporary table 9854 temporary_table = transcripts_table + "".join( 9855 random.choices(string.ascii_uppercase + string.digits, k=10) 9856 ) 9857 9858 # Temporary_tables 9859 temporary_tables.append(temporary_table) 9860 query_view = f""" 9861 CREATE TEMPORARY TABLE {temporary_table} 9862 AS ({query}) 9863 """ 9864 self.execute_query(query=query_view) 9865 9866 return added_columns, temporary_tables, annotation_fields 9867 9868 def create_transcript_view_from_column_format( 9869 self, 9870 transcripts_table: str = "transcripts", 9871 column_formats: dict = {}, 9872 temporary_tables: list = None, 9873 annotation_fields: list = None, 9874 ) -> tuple[list, list, list]: 9875 """ 9876 The `create_transcript_view_from_column_format` function generates a transcript view based on 9877 specified column formats, adds additional columns and annotation fields, and returns the list of 9878 temporary tables and annotation fields. 9879 9880 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9881 the table containing the transcripts data. This table will be used as the base table for creating 9882 the transcript view. 
The default value for this parameter is "transcripts", but you can provide a 9883 different table name if needed, defaults to transcripts 9884 :type transcripts_table: str (optional) 9885 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9886 about the columns to be used for creating the transcript view. Each entry in the dictionary 9887 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9888 the provided code snippet: 9889 :type column_formats: dict 9890 :param temporary_tables: The `temporary_tables` parameter in the 9891 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9892 views created during the process of creating a transcript view from a column format. These temporary 9893 views are used to manipulate and extract data before generating the final transcript view. It 9894 :type temporary_tables: list 9895 :param annotation_fields: The `annotation_fields` parameter in the 9896 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9897 that are extracted from the temporary views created during the process. These annotation fields are 9898 obtained by querying the temporary views and extracting the column names excluding specific columns 9899 like `#CH 9900 :type annotation_fields: list 9901 :return: The `create_transcript_view_from_column_format` function returns two lists: 9902 `temporary_tables` and `annotation_fields`. 
9903 """ 9904 9905 log.debug("Start transcrpts view creation from column format...") 9906 9907 # "from_column_format": [ 9908 # { 9909 # "transcripts_column": "ANN", 9910 # "transcripts_infos_column": "Feature_ID", 9911 # } 9912 # ], 9913 9914 # Init 9915 if temporary_tables is None: 9916 temporary_tables = [] 9917 if annotation_fields is None: 9918 annotation_fields = [] 9919 9920 for column_format in column_formats: 9921 9922 # annotation field and transcript annotation field 9923 annotation_field = column_format.get("transcripts_column", "ANN") 9924 transcript_annotation = column_format.get( 9925 "transcripts_infos_column", "Feature_ID" 9926 ) 9927 9928 # Temporary View name 9929 temporary_view_name = transcripts_table + "".join( 9930 random.choices(string.ascii_uppercase + string.digits, k=10) 9931 ) 9932 9933 # Create temporary view name 9934 temporary_view_name = self.annotation_format_to_table( 9935 uniquify=True, 9936 annotation_field=annotation_field, 9937 view_name=temporary_view_name, 9938 annotation_id=transcript_annotation, 9939 ) 9940 9941 # Annotation fields 9942 if temporary_view_name: 9943 query_annotation_fields = f""" 9944 SELECT * 9945 FROM ( 9946 DESCRIBE SELECT * 9947 FROM {temporary_view_name} 9948 ) 9949 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9950 """ 9951 df_annotation_fields = self.get_query_to_df( 9952 query=query_annotation_fields 9953 ) 9954 9955 # Add temporary view and annotation fields 9956 temporary_tables.append(temporary_view_name) 9957 annotation_fields += list(set(df_annotation_fields["column_name"])) 9958 9959 return temporary_tables, annotation_fields 9960 9961 def create_transcript_view( 9962 self, 9963 transcripts_table: str = None, 9964 transcripts_table_drop: bool = True, 9965 param: dict = {}, 9966 ) -> str: 9967 """ 9968 The `create_transcript_view` function generates a transcript view by processing data from a 9969 specified table based on provided parameters and structural information. 
9970 9971 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9972 is used to specify the name of the table that will store the final transcript view data. If a table 9973 name is not provided, the function will create a new table to store the transcript view data, and by 9974 default,, defaults to transcripts 9975 :type transcripts_table: str (optional) 9976 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9977 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9978 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9979 the function will drop the existing transcripts table if it exists, defaults to True 9980 :type transcripts_table_drop: bool (optional) 9981 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9982 contains information needed to create a transcript view. It includes details such as the structure 9983 of the transcripts, columns mapping, column formats, and other necessary information for generating 9984 the view. This parameter allows for flexibility and customization 9985 :type param: dict 9986 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9987 created or modified during the execution of the function. 
9988 """ 9989 9990 log.debug("Start transcripts view creation...") 9991 9992 # Default 9993 transcripts_table_default = "transcripts" 9994 9995 # Param 9996 if not param: 9997 param = self.get_param() 9998 9999 # Struct 10000 struct = param.get("transcripts", {}).get("struct", None) 10001 10002 if struct: 10003 10004 # Transcripts table 10005 if transcripts_table is None: 10006 transcripts_table = param.get("transcripts", {}).get( 10007 "table", transcripts_table_default 10008 ) 10009 10010 # added_columns 10011 added_columns = [] 10012 10013 # Temporary tables 10014 temporary_tables = [] 10015 10016 # Annotation fields 10017 annotation_fields = [] 10018 10019 # from columns map 10020 columns_maps = struct.get("from_columns_map", []) 10021 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10022 self.create_transcript_view_from_columns_map( 10023 transcripts_table=transcripts_table, 10024 columns_maps=columns_maps, 10025 added_columns=added_columns, 10026 temporary_tables=temporary_tables, 10027 annotation_fields=annotation_fields, 10028 ) 10029 ) 10030 added_columns += added_columns_tmp 10031 temporary_tables += temporary_tables_tmp 10032 annotation_fields += annotation_fields_tmp 10033 10034 # from column format 10035 column_formats = struct.get("from_column_format", []) 10036 temporary_tables_tmp, annotation_fields_tmp = ( 10037 self.create_transcript_view_from_column_format( 10038 transcripts_table=transcripts_table, 10039 column_formats=column_formats, 10040 temporary_tables=temporary_tables, 10041 annotation_fields=annotation_fields, 10042 ) 10043 ) 10044 temporary_tables += temporary_tables_tmp 10045 annotation_fields += annotation_fields_tmp 10046 10047 # Merge temporary tables query 10048 query_merge = "" 10049 for temporary_table in temporary_tables: 10050 10051 # First temporary table 10052 if not query_merge: 10053 query_merge = f""" 10054 SELECT * FROM {temporary_table} 10055 """ 10056 # other temporary table (using UNION) 10057 else: 
10058 query_merge += f""" 10059 UNION BY NAME SELECT * FROM {temporary_table} 10060 """ 10061 10062 # Merge on transcript 10063 query_merge_on_transcripts_annotation_fields = [] 10064 # Aggregate all annotations fields 10065 for annotation_field in set(annotation_fields): 10066 query_merge_on_transcripts_annotation_fields.append( 10067 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10068 ) 10069 # Query for transcripts view 10070 query_merge_on_transcripts = f""" 10071 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 10072 FROM ({query_merge}) 10073 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 10074 """ 10075 10076 # Drop transcript view is necessary 10077 if transcripts_table_drop: 10078 query_drop = f""" 10079 DROP TABLE IF EXISTS {transcripts_table}; 10080 """ 10081 self.execute_query(query=query_drop) 10082 10083 # Merge and create transcript view 10084 query_create_view = f""" 10085 CREATE TABLE IF NOT EXISTS {transcripts_table} 10086 AS {query_merge_on_transcripts} 10087 """ 10088 self.execute_query(query=query_create_view) 10089 10090 # Remove added columns 10091 for added_column in added_columns: 10092 self.drop_column(column=added_column) 10093 10094 else: 10095 10096 transcripts_table = None 10097 10098 return transcripts_table 10099 10100 def annotation_format_to_table( 10101 self, 10102 uniquify: bool = True, 10103 annotation_field: str = "ANN", 10104 annotation_id: str = "Feature_ID", 10105 view_name: str = "transcripts", 10106 ) -> str: 10107 """ 10108 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 10109 table format. 10110 10111 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 10112 values in the output or not. 
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. "ANN") into a
        temporary table with one typed column per annotation sub-field, plus a
        'transcript' column taken from the `annotation_id` sub-field.

        The sub-field names are parsed from the quoted part of the field's
        header description (split on " | ").

        :param uniquify: passed through to `explode_annotation_format` when
            converting each annotation value to JSON, defaults to True
        :param annotation_field: INFO field holding the annotations,
            defaults to "ANN"
        :param annotation_id: annotation sub-field used as the transcript
            identifier, defaults to "Feature_ID"
        :param view_name: name of the temporary table to create,
            defaults to "transcripts"
        :return: the name of the created temporary table, or None when
            `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate column that will hold the exploded JSON
        annotation_format = "annotation_explode"

        # Keep only alphanumeric characters (the id is injected into SQL below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix
        # NOTE(review): any truthy prefix is replaced by the literal "INFO/";
        # a falsy prefix (e.g. "") is kept as-is. Assumes
        # get_explode_infos_prefix() returns a string — a None prefix would
        # break the concatenations below. TODO confirm.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and of the JSON column
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added during this call, dropped again before returning
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description (e.g. "Functional annotations: 'A | B | C'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # ann_header collects the cleaned names; only ann_header_desc
                # (cleaned name -> original name) is used afterwards
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load the variants (with the exploded annotation column) into a
            # pandas DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation value into a JSON document keyed by the
            # header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry
            # NOTE(review): the queries below reference the local DataFrame
            # `dataframe_annotation_format` by name — presumably
            # get_query_to_df relies on DuckDB replacement scans resolving it;
            # TODO confirm the DataFrame is visible from that scope.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Cleaned key, used as SQL column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to sniff the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings, then replace empty
                # strings or None with NaN and drop those rows, so type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Append the typed extraction expression (empty string -> NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table with one row per annotation entry and
            # the transcript id exposed as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing created
            view_name = None

        # Remove the columns added by the explode step
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Export the transcripts table content back into the variants table, as
        a JSON column / INFO field and/or as a pipe-separated string column /
        INFO field.

        :param transcripts_table: table holding the transcripts; defaults to
            param["transcripts"]["table"] or "transcripts"
        :param transcripts_column_id: transcript identifier column; defaults
            to param["transcripts"]["column_id"] or "transcript"
        :param transcripts_info_json: name of a JSON column added to the
            variants table (one JSON object keyed by transcript id per variant)
        :param transcripts_info_field_json: name of a VCF INFO field that
            receives the JSON payload
        :param transcripts_info_format: name of a VARCHAR column added to the
            variants table ('id|field1|field2|...' entries)
        :param transcripts_info_field_format: name of a VCF INFO field that
            receives the formatted payload
        :param param: parameters dictionary; defaults to self.get_param()
        :return: True when at least one output was configured and applied,
            False when none of the four output options is set
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default values used when the param dict does not provide them
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to export when no output column/field is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # List the transcripts table columns other than the variant keys and
        # the transcript id: these are the fields to export
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SQL fragments: split comma-separated values into rows, build the
        # JSON struct entries, and the pipe-separated format entries
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses of the two UPDATE statements below
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" looks like a typo for "unknown" (the
            # source/version fields of the _Info record); kept as-is here.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append the JSON payload to the INFO field
            # NOTE(review): this SQL references t.{transcripts_info_json}
            # even when transcripts_info_json is None (only the INFO *field*
            # configured) — that would render as "t.None". Confirm callers
            # always set transcripts_info_json together with this option.
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Build one JSON object per variant, keyed by transcript id, and
            # join it back onto the variants table
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                        )::JSON AS {transcripts_info_json}
                    FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                    )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append the formatted payload to the INFO field
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Build one 'id|field|field|...' aggregate per variant and join
            # it back onto the variants table
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                    )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Samples 78 self.set_samples() 79 80 # Load data 81 if load: 82 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
84 def set_samples(self, samples: list = None) -> list: 85 """ 86 The function `set_samples` sets the samples attribute of an object to a provided list or 87 retrieves it from a parameter dictionary. 88 89 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 90 input and sets the `samples` attribute of the class to the provided list. If no samples are 91 provided, it tries to get the samples from the class's parameters using the `get_param` method 92 :type samples: list 93 :return: The `samples` list is being returned. 94 """ 95 96 if not samples: 97 samples = self.get_param().get("samples", {}).get("list", None) 98 99 self.samples = samples 100 101 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The
`set_samples` method is a method of a class that takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method
Returns
The
`samples` list is being returned.
103 def get_samples(self) -> list: 104 """ 105 This function returns a list of samples. 106 :return: The `get_samples` method is returning the `samples` attribute of the object. 107 """ 108 109 return self.samples
This function returns a list of samples.
Returns
The
`get_samples` method is returning the `samples` attribute of the object.
    def get_samples_check(self) -> bool:
        """
        Return the "check" flag of the "samples" section of the parameters.

        Looks up ``param["samples"]["check"]`` in the dictionary returned
        by `get_param()`.

        :return: the value of the "check" key inside the "samples"
            dictionary. If the key (or the whole "samples" section) is
            missing, the default is `True` — note the ``.get("check", True)``
            call below: the previous docstring incorrectly said `False`.
        """

        return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method
`get_samples_check` is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it will return `True` (the default in the code is `True`, not `False`).
122 def set_input(self, input: str = None) -> None: 123 """ 124 The function `set_input` takes a file name as input, extracts the name and extension, and sets 125 attributes in the class accordingly. 126 127 :param input: The `set_input` method in the provided code snippet is used to set attributes 128 related to the input file. Here's a breakdown of the parameters and their usage in the method: 129 :type input: str 130 """ 131 132 if input and not isinstance(input, str): 133 try: 134 self.input = input.name 135 except: 136 log.error(f"Input file '{input} in bad format") 137 raise ValueError(f"Input file '{input} in bad format") 138 else: 139 self.input = input 140 141 # Input format 142 if input: 143 input_name, input_extension = os.path.splitext(self.input) 144 self.input_name = input_name 145 self.input_extension = input_extension 146 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The
`set_input` method in the provided code snippet is used to set attributes related to the input file. Here's a breakdown of the parameters and their usage in the method:
148 def set_config(self, config: dict) -> None: 149 """ 150 The set_config function takes a config object and assigns it as the configuration object for the 151 class. 152 153 :param config: The `config` parameter in the `set_config` function is a dictionary object that 154 contains configuration settings for the class. When you call the `set_config` function with a 155 dictionary object as the argument, it will set that dictionary as the configuration object for 156 the class 157 :type config: dict 158 """ 159 160 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The
`config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
162 def set_param(self, param: dict) -> None: 163 """ 164 This function sets a parameter object for the class based on the input dictionary. 165 166 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 167 as the `param` attribute of the class instance 168 :type param: dict 169 """ 170 171 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The
`set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
173 def init_variables(self) -> None: 174 """ 175 This function initializes the variables that will be used in the rest of the class 176 """ 177 178 self.prefix = "howard" 179 self.table_variants = "variants" 180 self.dataframe = None 181 182 self.comparison_map = { 183 "gt": ">", 184 "gte": ">=", 185 "lt": "<", 186 "lte": "<=", 187 "equals": "=", 188 "contains": "SIMILAR TO", 189 } 190 191 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 192 193 self.code_type_map_to_sql = { 194 "Integer": "INTEGER", 195 "String": "VARCHAR", 196 "Float": "FLOAT", 197 "Flag": "VARCHAR", 198 } 199 200 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
202 def get_indexing(self) -> bool: 203 """ 204 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 205 returns False. 206 :return: The value of the indexing parameter. 207 """ 208 209 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
211 def get_connexion_config(self) -> dict: 212 """ 213 The function `get_connexion_config` returns a dictionary containing the configuration for a 214 connection, including the number of threads and memory limit. 215 :return: a dictionary containing the configuration for the Connexion library. 216 """ 217 218 # config 219 config = self.get_config() 220 221 # Connexion config 222 connexion_config = {} 223 threads = self.get_threads() 224 225 # Threads 226 if threads: 227 connexion_config["threads"] = threads 228 229 # Memory 230 # if config.get("memory", None): 231 # connexion_config["memory_limit"] = config.get("memory") 232 if self.get_memory(): 233 connexion_config["memory_limit"] = self.get_memory() 234 235 # Temporary directory 236 if config.get("tmp", None): 237 connexion_config["temp_directory"] = config.get("tmp") 238 239 # Access 240 if config.get("access", None): 241 access = config.get("access") 242 if access in ["RO"]: 243 access = "READ_ONLY" 244 elif access in ["RW"]: 245 access = "READ_WRITE" 246 connexion_db = self.get_connexion_db() 247 if connexion_db in ":memory:": 248 access = "READ_WRITE" 249 connexion_config["access_mode"] = access 250 251 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
253 def get_duckdb_settings(self) -> dict: 254 """ 255 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 256 string. 257 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 258 """ 259 260 # config 261 config = self.get_config() 262 263 # duckdb settings 264 duckdb_settings_dict = {} 265 if config.get("duckdb_settings", None): 266 duckdb_settings = config.get("duckdb_settings") 267 duckdb_settings = full_path(duckdb_settings) 268 # duckdb setting is a file 269 if os.path.exists(duckdb_settings): 270 with open(duckdb_settings) as json_file: 271 duckdb_settings_dict = yaml.safe_load(json_file) 272 # duckdb settings is a string 273 else: 274 duckdb_settings_dict = json.loads(duckdb_settings) 275 276 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function
`get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
278 def set_connexion_db(self) -> str: 279 """ 280 The function `set_connexion_db` returns the appropriate database connection string based on the 281 input format and connection type. 282 :return: the value of the variable `connexion_db`. 283 """ 284 285 # Default connexion db 286 default_connexion_db = ":memory:" 287 288 # Find connexion db 289 if self.get_input_format() in ["db", "duckdb"]: 290 connexion_db = self.get_input() 291 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 292 connexion_db = default_connexion_db 293 elif self.get_connexion_type() in ["tmpfile"]: 294 tmp_name = tempfile.mkdtemp( 295 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 296 ) 297 connexion_db = f"{tmp_name}/tmp.db" 298 elif self.get_connexion_type() != "": 299 connexion_db = self.get_connexion_type() 300 else: 301 connexion_db = default_connexion_db 302 303 # Set connexion db 304 self.connexion_db = connexion_db 305 306 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable
`connexion_db`.
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connexion for this object.

        When `conn` is provided it is used as-is. Otherwise a new connexion
        is opened on the database resolved by `set_connexion_db()`, using
        the format from the "connexion_format" config key ("duckdb" by
        default, "sqlite" also supported). For duckdb, optional settings
        from `get_duckdb_settings()` are applied via PRAGMA statements.

        Side effects: sets `self.connexion_format`, `self.connexion_db`
        (through `set_connexion_db`) and `self.conn`.

        :param conn: an existing database connexion, or None to create one
        """

        # Connexion db (must be resolved before building the config below)
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied one by one as PRAGMAs;
                # string values are quoted for SQL
                # NOTE(review): setting names/values are interpolated
                # directly into the PRAGMA — assumes trusted configuration
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The
`conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
354 def set_output(self, output: str = None) -> None: 355 """ 356 The `set_output` function in Python sets the output file based on the input or a specified key 357 in the config file, extracting the output name, extension, and format. 358 359 :param output: The `output` parameter in the `set_output` method is used to specify the name of 360 the output file. If the config file has an 'output' key, the method sets the output to the value 361 of that key. If no output is provided, it sets the output to `None` 362 :type output: str 363 """ 364 365 if output and not isinstance(output, str): 366 self.output = output.name 367 else: 368 self.output = output 369 370 # Output format 371 if self.output: 372 output_name, output_extension = os.path.splitext(self.output) 373 self.output_name = output_name 374 self.output_extension = output_extension 375 self.output_format = self.output_extension.replace(".", "") 376 else: 377 self.output_name = None 378 self.output_extension = None 379 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The
`output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
    def set_header(self) -> None:
        """
        Read or build the VCF header of the input file.

        The header is stored twice on the object:
        - ``self.header_list``: the raw header lines as a list of strings
        - ``self.header_vcf``: the header parsed as a ``vcf.Reader`` object
        Both are None when no input file is set.

        Header lookup order, for supported input formats:
        1. an explicit header file given by the "header_file" config key;
        2. the header embedded in a VCF/HDR input file (bgzipped or plain);
        3. a sidecar ``<input>.hdr`` file next to the input file;
        4. a header inferred from the columns of the input database
           (tsv/csv/psv/parquet/db/duckdb), via the Database helper;
        5. a minimal default VCF header as a last resort.

        :raises ValueError: when the input format is not supported
        """

        input_file = self.get_input()
        # Minimal fallback header (VCF columns are tab-separated)
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): deliberately broad — any failure while
                    # inferring the header falls back to the default header
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame.

        The execution path depends on the connexion format: duckdb results
        are fetched through the connexion directly, sqlite results through
        ``pandas.read_sql_query``.

        :param query: the SQL query to execute
        :param limit: maximum number of rows to return; when given, only
            the first `limit` rows of the result are fetched. When None,
            the full result is returned.
        :return: the query result as a pandas DataFrame
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): changes the global pandas display option as a
            # side effect of asking for a limited result — confirm intended
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch a single record batch of up to `limit` rows
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # Take only the first chunk of `limit` rows
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        # NOTE(review): with an unrecognized connexion format, `df` is
        # never bound and this raises UnboundLocalError — presumably the
        # format is guaranteed to be duckdb/sqlite upstream; confirm
        return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The
`query` parameter in the `get_query_to_df` function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: The
`limit` parameter in the `get_query_to_df` function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns
A pandas DataFrame is being returned by the
`get_query_to_df` function.
525 def get_overview(self) -> None: 526 """ 527 The function prints the input, output, config, and dataframe of the current object 528 """ 529 table_variants_from = self.get_table_variants(clause="from") 530 sql_columns = self.get_header_columns_as_sql() 531 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 532 df = self.get_query_to_df(sql_query_export) 533 log.info( 534 "Input: " 535 + str(self.get_input()) 536 + " [" 537 + str(str(self.get_input_format())) 538 + "]" 539 ) 540 log.info( 541 "Output: " 542 + str(self.get_output()) 543 + " [" 544 + str(str(self.get_output_format())) 545 + "]" 546 ) 547 log.info("Config: ") 548 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 549 "\n" 550 ): 551 log.info("\t" + str(d)) 552 log.info("Param: ") 553 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 554 "\n" 555 ): 556 log.info("\t" + str(d)) 557 log.info("Sample list: " + str(self.get_header_sample_list())) 558 log.info("Dataframe: ") 559 for d in str(df).split("\n"): 560 log.info("\t" + str(d)) 561 562 # garbage collector 563 del df 564 gc.collect() 565 566 return None
The function prints the input, output, config, and dataframe of the current object
    def get_stats(self) -> dict:
        """
        Compute statistics of the loaded variants.

        :return: a dictionary with the following sections:
            - "Infos": input file, number of variants, samples, INFO and
              FORMAT fields
            - "Variants": counts by chromosome, type counts (Total/SNV/
              MNV/InDel) and SNV substitutions
            - "Samples": per-sample genotype counts and percentages
            - "Header": INFO and FORMAT field descriptions
            - "Quality": QUAL statistics (only when a QUAL column exists)
        """

        # Log
        log.info(f"Stats Calculation...")

        # variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of each chromosome's contribution
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when the file has genotypes
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; the regexp extracts the GT
                # prefix (e.g. "0/1") of each sample cell, and only rows
                # whose cell matches a genotype and has as many ':' fields
                # as FORMAT are counted
                sql_query_samples = f"""
                    SELECT
                        '{sample}' as sample,
                        REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                        count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                        concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                        )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # i is a global running index across both INFO and FORMAT sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special VCF "Number" codes are mapped back to their
                # symbolic form (None->'.', -1->'A', -2->'G', -3->'R')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than
        # OR, so it reads `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) !=
        # len(ALT))` — confirm this is the intended classification
        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution counts (SNVs only)
        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
790 def stats_to_file(self, file: str = None) -> str: 791 """ 792 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 793 into a JSON object, and writes the JSON object to the specified file. 794 795 :param file: The `file` parameter is a string that represents the file path where the JSON data 796 will be written 797 :type file: str 798 :return: the name of the file that was written to. 799 """ 800 801 # Get stats 802 stats = self.get_stats() 803 804 # Serializing json 805 json_object = json.dumps(stats, indent=4) 806 807 # Writing to sample.json 808 with open(file, "w") as outfile: 809 outfile.write(json_object) 810 811 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The
`file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it.

        The stats are first written as JSON (via `stats_to_file`), then
        reloaded and rendered as markdown (title, index, one section per
        stats key, tables where the values are table-like) to
        `output_file`, and finally echoed to stdout.

        :param output_file: path of the markdown output file; when None, a
            temporary "stats.md" is used (and discarded with the tempdir)
        :param json_file: path of the JSON stats file; when None, a
            temporary "stats.json" is used (and discarded with the tempdir)
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files default into the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Reload the stats (YAML loader also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index of sections, and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the value as a table; fall back to
                        # a plain "key: value" bullet when it is not
                        # table-like (directly or as a JSON string)
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file (inside the tempdir context so
            # that default files still exist when written)
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown on stdout
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The
`output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory.
- json_file: The
`json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function
`print_stats` does not return any value. It has a return type annotation of `None`.
915 def get_input(self) -> str: 916 """ 917 It returns the value of the input variable. 918 :return: The input is being returned. 919 """ 920 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
922 def get_input_format(self, input_file: str = None) -> str: 923 """ 924 This function returns the format of the input variable, either from the provided input file or 925 by prompting for input. 926 927 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 928 represents the file path of the input file. If no `input_file` is provided when calling the 929 method, it will default to `None` 930 :type input_file: str 931 :return: The format of the input variable is being returned. 932 """ 933 934 if not input_file: 935 input_file = self.get_input() 936 input_format = get_file_format(input_file) 937 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The
The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
939 def get_input_compressed(self, input_file: str = None) -> str: 940 """ 941 The function `get_input_compressed` returns the format of the input variable after compressing 942 it. 943 944 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 945 that represents the file path of the input file. If no `input_file` is provided when calling the 946 method, it will default to `None` and the method will then call `self.get_input()` to 947 :type input_file: str 948 :return: The function `get_input_compressed` returns the compressed format of the input 949 variable. 950 """ 951 952 if not input_file: 953 input_file = self.get_input() 954 input_compressed = get_file_compressed(input_file) 955 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
input_fileparameter in theget_input_compressedmethod is a string that represents the file path of the input file. If noinput_fileis provided when calling the method, it will default toNoneand the method will then callself.get_input()to
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
957 def get_output(self) -> str: 958 """ 959 It returns the output of the neuron. 960 :return: The output of the neural network. 961 """ 962 963 return self.output
It returns the output of the neuron.
Returns
The output of the neural network.
965 def get_output_format(self, output_file: str = None) -> str: 966 """ 967 The function `get_output_format` returns the format of the input variable or the output file if 968 provided. 969 970 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 971 that represents the file path of the output file. If no `output_file` is provided when calling 972 the method, it will default to the output obtained from the `get_output` method of the class 973 instance. The 974 :type output_file: str 975 :return: The format of the input variable is being returned. 976 """ 977 978 if not output_file: 979 output_file = self.get_output() 980 output_format = get_file_format(output_file) 981 982 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
The `output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
984 def get_config(self) -> dict: 985 """ 986 It returns the config 987 :return: The config variable is being returned. 988 """ 989 return self.config
It returns the config
Returns
The config variable is being returned.
991 def get_param(self) -> dict: 992 """ 993 It returns the param 994 :return: The param variable is being returned. 995 """ 996 return self.param
It returns the param
Returns
The param variable is being returned.
998 def get_connexion_db(self) -> str: 999 """ 1000 It returns the connexion_db attribute of the object 1001 :return: The connexion_db is being returned. 1002 """ 1003 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1005 def get_prefix(self) -> str: 1006 """ 1007 It returns the prefix of the object. 1008 :return: The prefix is being returned. 1009 """ 1010 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1012 def get_table_variants(self, clause: str = "select") -> str: 1013 """ 1014 This function returns the table_variants attribute of the object 1015 1016 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1017 defaults to select (optional) 1018 :return: The table_variants attribute of the object. 1019 """ 1020 1021 # Access 1022 access = self.get_config().get("access", None) 1023 1024 # Clauses "select", "where", "update" 1025 if clause in ["select", "where", "update"]: 1026 table_variants = self.table_variants 1027 # Clause "from" 1028 elif clause in ["from"]: 1029 # For Read Only 1030 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1031 input_file = self.get_input() 1032 table_variants = f"'{input_file}' as variants" 1033 # For Read Write 1034 else: 1035 table_variants = f"{self.table_variants} as variants" 1036 else: 1037 table_variants = self.table_variants 1038 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1040 def get_tmp_dir(self) -> str: 1041 """ 1042 The function `get_tmp_dir` returns the temporary directory path based on configuration 1043 parameters or a default path. 1044 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1045 configuration, parameters, and a default value of "/tmp". 1046 """ 1047 1048 return get_tmp( 1049 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1050 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
get_tmp_dirmethod is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1052 def get_connexion_type(self) -> str: 1053 """ 1054 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1055 1056 :return: The connexion type is being returned. 1057 """ 1058 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1060 def get_connexion(self): 1061 """ 1062 It returns the connection object 1063 1064 :return: The connection object. 1065 """ 1066 return self.conn
It returns the connection object
Returns
The connection object.
1068 def close_connexion(self) -> None: 1069 """ 1070 This function closes the connection to the database. 1071 :return: The connection is being closed. 1072 """ 1073 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1075 def get_header(self, type: str = "vcf"): 1076 """ 1077 This function returns the header of the VCF file as a list of strings 1078 1079 :param type: the type of header you want to get, defaults to vcf (optional) 1080 :return: The header of the vcf file. 1081 """ 1082 1083 if self.header_vcf: 1084 if type == "vcf": 1085 return self.header_vcf 1086 elif type == "list": 1087 return self.header_list 1088 else: 1089 if type == "vcf": 1090 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1091 return header 1092 elif type == "list": 1093 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1095 def get_header_length(self, file: str = None) -> int: 1096 """ 1097 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1098 line. 1099 1100 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1101 header file. If this argument is provided, the function will read the header from the specified 1102 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1103 :type file: str 1104 :return: the length of the header list, excluding the #CHROM line. 1105 """ 1106 1107 if file: 1108 return len(self.read_vcf_header_file(file=file)) - 1 1109 elif self.get_header(type="list"): 1110 return len(self.get_header(type="list")) - 1 1111 else: 1112 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
fileparameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns
the length of the header list, excluding the #CHROM line.
1114 def get_header_columns(self) -> str: 1115 """ 1116 This function returns the header list of a VCF 1117 1118 :return: The length of the header list. 1119 """ 1120 if self.get_header(): 1121 return self.get_header(type="list")[-1] 1122 else: 1123 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1125 def get_header_columns_as_list(self) -> list: 1126 """ 1127 This function returns the header list of a VCF 1128 1129 :return: The length of the header list. 1130 """ 1131 if self.get_header(): 1132 return self.get_header_columns().strip().split("\t") 1133 else: 1134 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1136 def get_header_columns_as_sql(self) -> str: 1137 """ 1138 This function retruns header length (without #CHROM line) 1139 1140 :return: The length of the header list. 1141 """ 1142 sql_column_list = [] 1143 for col in self.get_header_columns_as_list(): 1144 sql_column_list.append(f'"{col}"') 1145 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1147 def get_header_sample_list( 1148 self, check: bool = False, samples: list = None, samples_force: bool = False 1149 ) -> list: 1150 """ 1151 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1152 checking and filtering based on input parameters. 1153 1154 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1155 parameter that determines whether to check if the samples in the list are properly defined as 1156 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1157 list is defined as a, defaults to False 1158 :type check: bool (optional) 1159 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1160 allows you to specify a subset of samples from the header. If you provide a list of sample 1161 names, the function will check if each sample is defined in the header. If a sample is not found 1162 in the 1163 :type samples: list 1164 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1165 a boolean parameter that determines whether to force the function to return the sample list 1166 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1167 function will return the sample list without performing, defaults to False 1168 :type samples_force: bool (optional) 1169 :return: The function `get_header_sample_list` returns a list of samples based on the input 1170 parameters and conditions specified in the function. 
1171 """ 1172 1173 # Init 1174 samples_list = [] 1175 1176 if samples is None: 1177 samples_list = self.header_vcf.samples 1178 else: 1179 samples_checked = [] 1180 for sample in samples: 1181 if sample in self.header_vcf.samples: 1182 samples_checked.append(sample) 1183 else: 1184 log.warning(f"Sample '{sample}' not defined in header") 1185 samples_list = samples_checked 1186 1187 # Force sample list without checking if is_genotype_column 1188 if samples_force: 1189 log.warning(f"Samples {samples_list} not checked if genotypes") 1190 return samples_list 1191 1192 if check: 1193 samples_checked = [] 1194 for sample in samples_list: 1195 if self.is_genotype_column(column=sample): 1196 samples_checked.append(sample) 1197 else: 1198 log.warning( 1199 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1200 ) 1201 samples_list = samples_checked 1202 1203 # Return samples list 1204 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The
checkparameter in theget_header_sample_listfunction is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. Ifcheckis set toTrue, the function will verify if each sample in the list is defined as a, defaults to False - samples: The
samplesparameter in theget_header_sample_listfunction is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the - samples_force: The
samples_forceparameter in theget_header_sample_listfunction is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. Ifsamples_forceis set toTrue, the function will return the sample list without performing, defaults to False
Returns
The function
get_header_sample_listreturns a list of samples based on the input parameters and conditions specified in the function.
1206 def is_genotype_column(self, column: str = None) -> bool: 1207 """ 1208 This function checks if a given column is a genotype column in a database. 1209 1210 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1211 represents the column name in a database table. This method checks if the specified column is a 1212 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1213 method of 1214 :type column: str 1215 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1216 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1217 column name and returns the result. If the `column` parameter is None, it returns False. 1218 """ 1219 1220 if column is not None: 1221 return Database(database=self.get_input()).is_genotype_column(column=column) 1222 else: 1223 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The
columnparameter in theis_genotype_columnmethod is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls theis_genotype_columnmethod of
Returns
The
is_genotype_columnmethod is returning a boolean value. If thecolumnparameter is not None, it calls theis_genotype_columnmethod of theDatabaseclass with the specified column name and returns the result. If thecolumnparameter is None, it returns False.
1225 def get_verbose(self) -> bool: 1226 """ 1227 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1228 exist 1229 1230 :return: The value of the key "verbose" in the config dictionary. 1231 """ 1232 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1234 def get_connexion_format(self) -> str: 1235 """ 1236 It returns the connexion format of the object. 1237 :return: The connexion_format is being returned. 1238 """ 1239 connexion_format = self.connexion_format 1240 if connexion_format not in ["duckdb", "sqlite"]: 1241 log.error(f"Unknown connexion format {connexion_format}") 1242 raise ValueError(f"Unknown connexion format {connexion_format}") 1243 else: 1244 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
1246 def insert_file_to_table( 1247 self, 1248 file, 1249 columns: str, 1250 header_len: int = 0, 1251 sep: str = "\t", 1252 chunksize: int = 1000000, 1253 ) -> None: 1254 """ 1255 The function reads a file in chunks and inserts each chunk into a table based on the specified 1256 database format. 1257 1258 :param file: The `file` parameter is the file that you want to load into a table. It should be 1259 the path to the file on your system 1260 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1261 should contain the names of the columns in the table where the data will be inserted. The column 1262 names should be separated by commas within the string. For example, if you have columns named 1263 "id", "name 1264 :type columns: str 1265 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1266 the number of lines to skip at the beginning of the file before reading the actual data. This 1267 parameter allows you to skip any header information present in the file before processing the 1268 data, defaults to 0 1269 :type header_len: int (optional) 1270 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1271 separator character that is used in the file being read. In this case, the default separator is 1272 set to `\t`, which represents a tab character. You can change this parameter to a different 1273 separator character if, defaults to \t 1274 :type sep: str (optional) 1275 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1276 when processing the file in chunks. In the provided code snippet, the default value for 1277 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1278 to 1000000 1279 :type chunksize: int (optional) 1280 """ 1281 1282 # Config 1283 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1284 connexion_format = self.get_connexion_format() 1285 1286 log.debug("chunksize: " + str(chunksize)) 1287 1288 if chunksize: 1289 for chunk in pd.read_csv( 1290 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1291 ): 1292 if connexion_format in ["duckdb"]: 1293 sql_insert_into = ( 1294 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1295 ) 1296 self.conn.execute(sql_insert_into) 1297 elif connexion_format in ["sqlite"]: 1298 chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
The `sep` parameter in the `insert_file_to_table` function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to `\t`, which represents a tab character. You can change this parameter to a different separator character if needed; defaults to `\t`.
- chunksize: The
chunksizeparameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value forchunksizeis set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
1300 def load_data( 1301 self, 1302 input_file: str = None, 1303 drop_variants_table: bool = False, 1304 sample_size: int = 20480, 1305 ) -> None: 1306 """ 1307 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1308 table before loading the data and specify a sample size. 1309 1310 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1311 table 1312 :type input_file: str 1313 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1314 determines whether the variants table should be dropped before loading the data. If set to 1315 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1316 not be dropped, defaults to False 1317 :type drop_variants_table: bool (optional) 1318 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1319 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1320 20480 1321 :type sample_size: int (optional) 1322 """ 1323 1324 log.info("Loading...") 1325 1326 # change input file 1327 if input_file: 1328 self.set_input(input_file) 1329 self.set_header() 1330 1331 # drop variants table 1332 if drop_variants_table: 1333 self.drop_variants_table() 1334 1335 # get table variants 1336 table_variants = self.get_table_variants() 1337 1338 # Access 1339 access = self.get_config().get("access", None) 1340 log.debug(f"access: {access}") 1341 1342 # Input format and compress 1343 input_format = self.get_input_format() 1344 input_compressed = self.get_input_compressed() 1345 log.debug(f"input_format: {input_format}") 1346 log.debug(f"input_compressed: {input_compressed}") 1347 1348 # input_compressed_format 1349 if input_compressed: 1350 input_compressed_format = "gzip" 1351 else: 1352 input_compressed_format = "none" 1353 log.debug(f"input_compressed_format: {input_compressed_format}") 1354 1355 # Connexion 
format 1356 connexion_format = self.get_connexion_format() 1357 1358 # Sample size 1359 if not sample_size: 1360 sample_size = -1 1361 log.debug(f"sample_size: {sample_size}") 1362 1363 # Load data 1364 log.debug(f"Load Data from {input_format}") 1365 1366 # DuckDB connexion 1367 if connexion_format in ["duckdb"]: 1368 1369 # Database already exists 1370 if self.input_format in ["db", "duckdb"]: 1371 1372 if connexion_format in ["duckdb"]: 1373 log.debug(f"Input file format '{self.input_format}' duckDB") 1374 else: 1375 log.error( 1376 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1377 ) 1378 raise ValueError( 1379 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1380 ) 1381 1382 # Load from existing database format 1383 else: 1384 1385 try: 1386 # Create Table or View 1387 database = Database(database=self.input) 1388 sql_from = database.get_sql_from(sample_size=sample_size) 1389 1390 if access in ["RO"]: 1391 sql_load = ( 1392 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1393 ) 1394 else: 1395 sql_load = ( 1396 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1397 ) 1398 self.conn.execute(sql_load) 1399 1400 except: 1401 # Format not available 1402 log.error(f"Input file format '{self.input_format}' not available") 1403 raise ValueError( 1404 f"Input file format '{self.input_format}' not available" 1405 ) 1406 1407 # SQLite connexion 1408 elif connexion_format in ["sqlite"] and input_format in [ 1409 "vcf", 1410 "tsv", 1411 "csv", 1412 "psv", 1413 ]: 1414 1415 # Main structure 1416 structure = { 1417 "#CHROM": "VARCHAR", 1418 "POS": "INTEGER", 1419 "ID": "VARCHAR", 1420 "REF": "VARCHAR", 1421 "ALT": "VARCHAR", 1422 "QUAL": "VARCHAR", 1423 "FILTER": "VARCHAR", 1424 "INFO": "VARCHAR", 1425 } 1426 1427 # Strcuture with samples 1428 structure_complete = structure 1429 if self.get_header_sample_list(): 1430 structure["FORMAT"] = "VARCHAR" 
1431 for sample in self.get_header_sample_list(): 1432 structure_complete[sample] = "VARCHAR" 1433 1434 # Columns list for create and insert 1435 sql_create_table_columns = [] 1436 sql_create_table_columns_list = [] 1437 for column in structure_complete: 1438 column_type = structure_complete[column] 1439 sql_create_table_columns.append( 1440 f'"{column}" {column_type} default NULL' 1441 ) 1442 sql_create_table_columns_list.append(f'"{column}"') 1443 1444 # Create database 1445 log.debug(f"Create Table {table_variants}") 1446 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1447 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1448 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1449 self.conn.execute(sql_create_table) 1450 1451 # chunksize define length of file chunk load file 1452 chunksize = 100000 1453 1454 # delimiter 1455 delimiter = file_format_delimiters.get(input_format, "\t") 1456 1457 # Load the input file 1458 with open(self.input, "rt") as input_file: 1459 1460 # Use the appropriate file handler based on the input format 1461 if input_compressed: 1462 input_file = bgzf.open(self.input, "rt") 1463 if input_format in ["vcf"]: 1464 header_len = self.get_header_length() 1465 else: 1466 header_len = 0 1467 1468 # Insert the file contents into a table 1469 self.insert_file_to_table( 1470 input_file, 1471 columns=sql_create_table_columns_list_sql, 1472 header_len=header_len, 1473 sep=delimiter, 1474 chunksize=chunksize, 1475 ) 1476 1477 else: 1478 log.error( 1479 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1480 ) 1481 raise ValueError( 1482 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1483 ) 1484 1485 # Explode INFOS fields into table fields 1486 if self.get_explode_infos(): 1487 self.explode_infos( 1488 prefix=self.get_explode_infos_prefix(), 1489 fields=self.get_explode_infos_fields(), 
1490 force=True, 1491 ) 1492 1493 # Create index after insertion 1494 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
drop_variants_tableparameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set toTrue, the variants table will be dropped. If set toFalse(default), the variants table will not be dropped, defaults to False - sample_size: The
sample_sizeparameter determines the number of rows to be sampled from the input file. If it is set toNone, the default value of 20480 will be used, defaults to 20480
1496 def get_explode_infos(self) -> bool: 1497 """ 1498 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1499 to False if it is not set. 1500 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1501 value. If the parameter is not present, it will return False. 1502 """ 1503 1504 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
1506 def get_explode_infos_fields( 1507 self, 1508 explode_infos_fields: str = None, 1509 remove_fields_not_in_header: bool = False, 1510 ) -> list: 1511 """ 1512 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1513 the input parameter `explode_infos_fields`. 1514 1515 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1516 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1517 comma-separated list of field names to explode 1518 :type explode_infos_fields: str 1519 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1520 flag that determines whether to remove fields that are not present in the header. If it is set 1521 to `True`, any field that is not in the header will be excluded from the list of exploded 1522 information fields. If it is set to `, defaults to False 1523 :type remove_fields_not_in_header: bool (optional) 1524 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1525 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1526 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1527 Otherwise, it returns a list of exploded information fields after removing any spaces and 1528 splitting the string by commas. 
1529 """ 1530 1531 # If no fields, get it in param 1532 if not explode_infos_fields: 1533 explode_infos_fields = ( 1534 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1535 ) 1536 1537 # If no fields, defined as all fields in header using keyword 1538 if not explode_infos_fields: 1539 explode_infos_fields = "*" 1540 1541 # If fields list not empty 1542 if explode_infos_fields: 1543 1544 # Input fields list 1545 if isinstance(explode_infos_fields, str): 1546 fields_input = explode_infos_fields.split(",") 1547 elif isinstance(explode_infos_fields, list): 1548 fields_input = explode_infos_fields 1549 else: 1550 fields_input = [] 1551 1552 # Fields list without * keyword 1553 fields_without_all = fields_input.copy() 1554 if "*".casefold() in (item.casefold() for item in fields_without_all): 1555 fields_without_all.remove("*") 1556 1557 # Fields in header 1558 fields_in_header = sorted(list(set(self.get_header().infos))) 1559 1560 # Construct list of fields 1561 fields_output = [] 1562 for field in fields_input: 1563 1564 # Strip field 1565 field = field.strip() 1566 1567 # format keyword * in regex 1568 if field.upper() in ["*"]: 1569 field = ".*" 1570 1571 # Find all fields with pattern 1572 r = re.compile(field) 1573 fields_search = sorted(list(filter(r.match, fields_in_header))) 1574 1575 # Remove fields input from search 1576 if field in fields_search: 1577 fields_search = [field] 1578 elif fields_search != [field]: 1579 fields_search = sorted( 1580 list(set(fields_search).difference(fields_input)) 1581 ) 1582 1583 # If field is not in header (avoid not well formatted header) 1584 if not fields_search and not remove_fields_not_in_header: 1585 fields_search = [field] 1586 1587 # Add found fields 1588 for new_field in fields_search: 1589 # Add field, if not already exists, and if it is in header (if asked) 1590 if ( 1591 new_field not in fields_output 1592 and ( 1593 not remove_fields_not_in_header 1594 or new_field in fields_in_header 1595 ) 
1596 and new_field not in [".*"] 1597 ): 1598 fields_output.append(new_field) 1599 1600 return fields_output 1601 1602 else: 1603 1604 return []
The `get_explode_infos_fields` function returns a list of INFO fields to explode, based on
the input parameter `explode_infos_fields`.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is either a comma-separated
string or a list of field names (patterns allowed) to be exploded. The keyword "*" selects
all fields declared in the header.
- remove_fields_not_in_header: The `remove_fields_not_in_header` parameter is a boolean flag
that determines whether to exclude fields that are not present in the header. If set to
`True`, any field not found in the header is excluded from the returned list; defaults to False.
Returns
The `get_explode_infos_fields` function returns a list of field names. When the parameter is
not provided, the fields are read from the "explode" section of the parameters, defaulting to
all header fields ("*"). Field names are stripped of surrounding spaces and patterns are
resolved against the fields declared in the header.
1606 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1607 """ 1608 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1609 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1610 not provided. 1611 1612 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1613 prefix to be used for exploding or expanding information 1614 :type explode_infos_prefix: str 1615 :return: the value of the variable `explode_infos_prefix`. 1616 """ 1617 1618 if not explode_infos_prefix: 1619 explode_infos_prefix = ( 1620 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1621 ) 1622 1623 return explode_infos_prefix
The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix`
parameter, or the value configured under the "explode" section of the parameters
(`get_param().get("explode", {}).get("explode_infos_prefix", "")`) if `explode_infos_prefix`
is not provided.
Parameters
- explode_infos_prefix: The `explode_infos_prefix` parameter is a string that specifies a
prefix to be used for exploding or expanding information.
Returns
The value of the variable `explode_infos_prefix`.
1625 def add_column( 1626 self, 1627 table_name, 1628 column_name, 1629 column_type, 1630 default_value=None, 1631 drop: bool = False, 1632 ) -> dict: 1633 """ 1634 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1635 doesn't already exist. 1636 1637 :param table_name: The name of the table to which you want to add a column 1638 :param column_name: The parameter "column_name" is the name of the column that you want to add 1639 to the table 1640 :param column_type: The `column_type` parameter specifies the data type of the column that you 1641 want to add to the table. It should be a string that represents the desired data type, such as 1642 "INTEGER", "TEXT", "REAL", etc 1643 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1644 default value for the newly added column. If a default value is provided, it will be assigned to 1645 the column for any existing rows that do not have a value for that column 1646 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1647 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1648 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1649 to False 1650 :type drop: bool (optional) 1651 :return: a boolean value indicating whether the column was successfully added to the table. 
1652 """ 1653 1654 # added 1655 added = False 1656 dropped = False 1657 1658 # Check if the column already exists in the table 1659 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1660 columns = self.get_query_to_df(query).columns.tolist() 1661 if column_name.upper() in [c.upper() for c in columns]: 1662 log.debug( 1663 f"The {column_name} column already exists in the {table_name} table" 1664 ) 1665 if drop: 1666 self.drop_column(table_name=table_name, column_name=column_name) 1667 dropped = True 1668 else: 1669 return None 1670 else: 1671 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1672 1673 # Add column in table 1674 add_column_query = ( 1675 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1676 ) 1677 if default_value is not None: 1678 add_column_query += f" DEFAULT {default_value}" 1679 self.execute_query(add_column_query) 1680 added = not dropped 1681 log.debug( 1682 f"The {column_name} column was successfully added to the {table_name} table" 1683 ) 1684 1685 if added: 1686 added_column = { 1687 "table_name": table_name, 1688 "column_name": column_name, 1689 "column_type": column_type, 1690 "default_value": default_value, 1691 } 1692 else: 1693 added_column = None 1694 1695 return added_column
The `add_column` function adds a column to a SQLite or DuckDB table, with an optional
default value, if the column doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column.
- column_name: The name of the column that you want to add to the table.
- column_type: The SQL data type of the new column, such as "INTEGER", "TEXT" or "REAL".
- default_value: Optional default value assigned to the new column for existing rows.
- drop: Boolean flag; when `True`, an existing column with the same name is dropped and
re-created. When `False` (default), an existing column is left untouched and the function
returns None.
Returns
A dictionary describing the added column (table name, column name, type and default value),
or None if the column was not newly added.
1697 def drop_column( 1698 self, column: dict = None, table_name: str = None, column_name: str = None 1699 ) -> bool: 1700 """ 1701 The `drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 
self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 1753 return removed
The `drop_column` function drops a specified column from a given table in a database and
returns True if the column was successfully dropped, and False if the column does not exist
in the table.
Parameters
- column: A dictionary with "table_name" and "column_name" keys, or a plain column name as a
string (in which case the variants table is used).
- table_name: The name of the table from which you want to drop a column.
- column_name: The name of the column that you want to drop from the table.
Returns
A boolean value: True if the column was dropped from the table, False if the column does not
exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode the INFO fields of the loaded VCF into individual columns of the
        variants table, returning the list of added columns.

        :param prefix: Prefix string for the exploded INFO columns. When not
            provided (or not a string), the value of
            `self.get_explode_infos_prefix()` is used, falling back to "INFO/"
        :type prefix: str
        :param create_index: Whether to create indexes after processing,
            defaults to False
        :type create_index: bool (optional)
        :param fields: List of INFO fields (patterns allowed) to explode into
            individual columns; resolved through `get_explode_infos_fields`
        :type fields: list
        :param force: Whether to drop and re-create a column that already exists
            in the table, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: Whether to update all the INFO
            fields in a single SQL UPDATE statement instead of one statement per
            field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: Name of the table where the exploded INFO columns are
            added; when not provided, the variants table is used
        :type table: str
        :return: The list of added columns (dicts as returned by `add_column`).
        """

        # drop indexes (they would be invalidated by the column additions)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Read-only connexions cannot be altered: return an empty list
        if access not in ["RO"]:

            # prefix: explicit string, else configured prefix, else "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header);
            # best-effort: failures are treated as "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the column holding the exploded value
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields default
                    # to a single String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the engine-specific UPDATE expression extracting
                        # "info=value" from the raw INFO column ('' and '.' map
                        # to NULL)
                        # NOTE(review): for a connexion format other than duckdb
                        # or sqlite, update_info_field would be unbound here —
                        # presumably only these two formats are ever used; confirm.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (process per-chromosome to bound UPDATE size);
                # on failure, fall back to a single pass with no WHERE clause
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE for all fields, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
columns, returning a list of added columns.
Parameters
- prefix: String prefix for the exploded INFO columns. When not provided, the value of
`get_explode_infos_prefix()` is used, falling back to "INFO/".
- create_index: Boolean flag; when `True`, indexes are created on the exploded INFO columns.
Defaults to False.
- fields: List of INFO fields (patterns allowed) to explode into individual columns. When not
provided, all INFO fields are considered.
- force: Boolean flag; when `True`, an existing column is dropped and re-created. Defaults to
False.
- proccess_all_fields_together: Boolean flag; when `True`, all INFO fields are updated in a
single SQL statement; otherwise each field is processed individually. Defaults to False.
- table: Name of the table where the exploded INFO columns are added. When not provided, the
variants table is used.
Returns
The `explode_infos` function returns a list of added columns.
1972 def create_indexes(self) -> None: 1973 """ 1974 Create indexes on the table after insertion 1975 """ 1976 1977 # Access 1978 access = self.get_config().get("access", None) 1979 1980 # get table variants 1981 table_variants = self.get_table_variants("FROM") 1982 1983 if self.get_indexing() and access not in ["RO"]: 1984 # Create index 1985 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1986 self.conn.execute(sql_create_table_index) 1987 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1988 self.conn.execute(sql_create_table_index) 1989 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1990 self.conn.execute(sql_create_table_index) 1991 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1992 self.conn.execute(sql_create_table_index) 1993 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1994 self.conn.execute(sql_create_table_index) 1995 for field in self.index_additionnal_fields: 1996 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1997 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1999 def drop_indexes(self) -> None: 2000 """ 2001 Create indexes on the table after insertion 2002 """ 2003 2004 # Access 2005 access = self.get_config().get("access", None) 2006 2007 # get table variants 2008 table_variants = self.get_table_variants("FROM") 2009 2010 # Get database format 2011 connexion_format = self.get_connexion_format() 2012 2013 if access not in ["RO"]: 2014 if connexion_format in ["duckdb"]: 2015 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2016 elif connexion_format in ["sqlite"]: 2017 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2018 2019 list_indexes = self.conn.execute(sql_list_indexes) 2020 index_names = [row[0] for row in list_indexes.fetchall()] 2021 for index in index_names: 2022 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2023 self.conn.execute(sql_drop_table_index)
Drop the existing indexes on the variants table.
2025 def read_vcf_header(self, f) -> list: 2026 """ 2027 It reads the header of a VCF file and returns a list of the header lines 2028 2029 :param f: the file object 2030 :return: The header lines of the VCF file. 2031 """ 2032 2033 header_list = [] 2034 for line in f: 2035 header_list.append(line) 2036 if line.startswith("#CHROM"): 2037 break 2038 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2040 def read_vcf_header_file(self, file: str = None) -> list: 2041 """ 2042 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2043 uncompressed files. 2044 2045 :param file: The `file` parameter is a string that represents the path to the VCF header file 2046 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2047 default to `None` 2048 :type file: str 2049 :return: The function `read_vcf_header_file` returns a list. 2050 """ 2051 2052 if self.get_input_compressed(input_file=file): 2053 with bgzf.open(file, "rt") as f: 2054 return self.read_vcf_header(f=f) 2055 else: 2056 with open(file, "rt") as f: 2057 return self.read_vcf_header(f=f)
The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed
and uncompressed files.
Parameters
- file: The `file` parameter is a string that represents the path to the VCF header file that
you want to read. It is an optional parameter, so if you don't provide a value, it will
default to `None`.
Returns
The function `read_vcf_header_file` returns a list of header lines.
2059 def execute_query(self, query: str): 2060 """ 2061 It takes a query as an argument, executes it, and returns the results 2062 2063 :param query: The query to be executed 2064 :return: The result of the query is being returned. 2065 """ 2066 if query: 2067 return self.conn.execute(query) # .fetchall() 2068 else: 2069 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        Export the loaded variants to an output file in various formats
        (VCF, CSV, TSV, PSV, Parquet, ...).

        :param output_file: Name of the output file to generate; where the
            exported data is saved. Defaults to the configured output
        :type output_file: str
        :param output_header: Name of the file where the VCF header is
            exported. When not provided, defaults to `output_file` plus the
            ".hdr" extension
        :type output_header: str
        :param export_header: Whether the header is exported to a separate
            file, defaults to True (switched off automatically when the output
            format is VCF)
        :type export_header: bool (optional)
        :param query: Optional SQL query used to filter/select the data to
            export
        :type query: str
        :param parquet_partitions: Columns used for partitioning the Parquet
            output (hierarchical directory structure based on column values)
        :type parquet_partitions: list
        :param chunk_size: Number of records per batch when exporting in
            Parquet format (used to split the output into multiple files)
        :type chunk_size: int
        :param threads: Number of threads used during the export; defaults to
            the configured number of threads
        :type threads: int
        :param sort: Whether the output file is sorted on genomic coordinates,
            defaults to False
        :type sort: bool (optional)
        :param index: Whether an index is created on the output file, defaults
            to False
        :type index: bool (optional)
        :param order_by: Column(s) used to sort the output file; only
            applicable for VCF output
        :type order_by: str
        :return: True if the output file exists after the export, None
            otherwise
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            # NOTE(review): called with output_file — presumably export_header
            # derives the ".hdr" path itself; confirm against its definition
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output (header is embedded in the VCF)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (accept comma-separated string from params)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        # sqlite cannot be handed directly to Database: dump the variants table
        # to a temporary Parquet file and use that as the source instead
        if connexion_format in ["sqlite"]:

            # Export in Parquet (random suffix avoids clashes between runs)
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        existing_columns_header = database.get_header_columns_from_database()

        # Sample list (only meaningful for VCF output)
        if output_file_type in ["vcf"]:
            get_samples = self.get_samples()
            get_samples_check = self.get_samples_check()
            # Force the sample list only when samples were explicitly configured
            samples_force = get_samples is not None
            sample_list = self.get_header_sample_list(
                check=get_samples_check,
                samples=get_samples,
                samples_force=samples_force,
            )
        else:
            sample_list = None

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove temporary files
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of this `and` are identical — this is
        # equivalent to a single `os.path.exists(output_file) or None`
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )
The `export_output` function exports data from a VCF file to a specified output file in
various formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: Name of the output file to be generated; where the exported data is saved.
- output_header: Name of the file where the VCF header is exported. When not provided, the
header is exported to a file named after `output_file` with the extension ".hdr".
- export_header: Boolean flag that determines whether the header is exported to a separate
file. Defaults to True (disabled automatically when the output format is VCF).
- query: Optional SQL query used to filter and select the data to export.
- parquet_partitions: List of columns used for partitioning the Parquet file during export.
Partitioning organizes data in a hierarchical directory structure based on the values of one
or more columns, which can improve query performance on large datasets.
- chunk_size: Number of records per batch when exporting data in Parquet format; used to
partition the Parquet output into multiple files.
- threads: Optional number of threads used during the export; defaults to the configured
number of threads.
- sort: Boolean flag; when `True`, the output file is sorted on the genomic coordinates of
the variants. Defaults to False.
- index: Boolean flag; when `True`, an index is created on the output file. Defaults to
False.
- order_by: Column(s) to use for sorting the output file; only applicable when exporting in
VCF format.
Returns
True if the output file exists after the export, None otherwise.
2272 def get_extra_infos(self, table: str = None) -> list: 2273 """ 2274 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2275 in the header. 2276 2277 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2278 name of the table from which you want to retrieve the extra columns that are not present in the 2279 header. If the `table` parameter is not provided when calling the function, it will default to 2280 using the variants 2281 :type table: str 2282 :return: A list of columns that are in the specified table but not in the header of the table. 2283 """ 2284 2285 header_columns = [] 2286 2287 if not table: 2288 table = self.get_table_variants(clause="from") 2289 header_columns = self.get_header_columns() 2290 2291 # Check all columns in the database 2292 query = f""" SELECT * FROM {table} LIMIT 1 """ 2293 log.debug(f"query {query}") 2294 table_columns = self.get_query_to_df(query).columns.tolist() 2295 extra_columns = [] 2296 2297 # Construct extra infos (not in header) 2298 for column in table_columns: 2299 if column not in header_columns: 2300 extra_columns.append(column) 2301 2302 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The `table` parameter in the `get_extra_infos` function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it defaults to the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2304 def get_extra_infos_sql(self, table: str = None) -> str: 2305 """ 2306 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2307 by double quotes 2308 2309 :param table: The name of the table to get the extra infos from. If None, the default table is 2310 used 2311 :type table: str 2312 :return: A string of the extra infos 2313 """ 2314 2315 return ", ".join( 2316 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2317 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        Extract the VCF header, optionally clean it, and write it to
        "<output_file><output_file_ext>".

        :param header_name: Name of the header file to be created.
            NOTE(review): this parameter is only read in the initial guard
            below and never used to name the output file — confirm intent.
        :type header_name: str
        :param output_file: Base name for the header file; the header is
            written to output_file + output_file_ext. Defaults to the
            object's output file when neither header_name nor output_file
            is provided.
        :type output_file: str
        :param output_file_ext: Extension appended to output_file to build
            the header file name, defaults to ".hdr"
        :type output_file_ext: str (optional)
        :param clean_header: When True, malformed "##FORMAT" lines declared
            as Type=Flag are rewritten as Type=String, defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: When True, the final #CHROM line is dropped
            from the header before writing, defaults to False
        :type remove_chrom_line: bool (optional)
        :return: The name of the header file that was written.
        """

        # Default the output base name to the object's output file
        if not header_name and not output_file:
            output_file = self.get_output()

        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Database object on the input file, used to read real columns
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Serialize the header object into a temporary file
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace the #CHROM line (last header line) with the real
                # columns present in the file
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove CHROM line
                if remove_chrom_line:
                    header_list.pop()

                # Clean header: rewrite malformed FORMAT lines declared as
                # Flag into String (at most 2 substitutions per line)
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # Clean head for malformed header
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Write header
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

                tmp_header_name = output_file + output_file_ext

                # Write the final header lines to the target file
                f = open(tmp_header_name, "w")
                for line in header_list:
                    f.write(line)
                f.close()

        # NOTE(review): when no header is available, tmp_header_name is never
        # assigned and this raises UnboundLocalError — confirm intended.
        return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The `header_name` parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file.
- output_file: The `output_file` parameter in the `export_header` function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file.
- output_file_ext: The `output_file_ext` parameter in the `export_header` function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension is appended to the `output_file` name to create the final file name.
- clean_header: The `clean_header` parameter in the `export_header` function is a boolean flag that determines whether the header should be cleaned or not. When `clean_header` is set to True, the function cleans the header by rewriting certain malformed lines based on a specific pattern. Defaults to True.
- remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line is removed; if False, it is kept. Defaults to False.
Returns
The function
export_headerreturns the name of the temporary header file that is created.
2414 def export_variant_vcf( 2415 self, 2416 vcf_file, 2417 remove_info: bool = False, 2418 add_samples: bool = True, 2419 list_samples: list = [], 2420 where_clause: str = "", 2421 index: bool = False, 2422 threads: int | None = None, 2423 ) -> bool | None: 2424 """ 2425 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2426 remove INFO field, add samples, and control compression and indexing. 2427 2428 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2429 written to. It is the output file that will contain the filtered VCF data based on the specified 2430 parameters 2431 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2432 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2433 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2434 in, defaults to False 2435 :type remove_info: bool (optional) 2436 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2437 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2438 If set to False, the samples will be removed. The default value is True, defaults to True 2439 :type add_samples: bool (optional) 2440 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2441 in the output VCF file. By default, all samples will be included. If you provide a list of 2442 samples, only those samples will be included in the output file 2443 :type list_samples: list 2444 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2445 determines whether or not to create an index for the output VCF file. If `index` is set to 2446 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2447 :type index: bool (optional) 2448 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2449 number of threads to use for exporting the VCF file. It determines how many parallel threads 2450 will be used during the export process. More threads can potentially speed up the export process 2451 by utilizing multiple cores of the processor. If 2452 :type threads: int | None 2453 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2454 method with various parameters including the output file, query, threads, sort flag, and index 2455 flag. The `export_output` method is responsible for exporting the VCF data based on the 2456 specified parameters and configurations provided in the `export_variant_vcf` function. 2457 """ 2458 2459 # Config 2460 config = self.get_config() 2461 2462 # Extract VCF 2463 log.debug("Export VCF...") 2464 2465 # Table variants 2466 table_variants = self.get_table_variants() 2467 2468 # Threads 2469 if not threads: 2470 threads = self.get_threads() 2471 2472 # Info fields 2473 if remove_info: 2474 if not isinstance(remove_info, str): 2475 remove_info = "." 
2476 info_field = f"""'{remove_info}' as INFO""" 2477 else: 2478 info_field = "INFO" 2479 2480 # Samples fields 2481 if add_samples: 2482 if not list_samples: 2483 list_samples = self.get_header_sample_list() 2484 if list_samples: 2485 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2486 else: 2487 samples_fields = "" 2488 log.debug(f"samples_fields: {samples_fields}") 2489 else: 2490 samples_fields = "" 2491 2492 # Where clause 2493 if where_clause is None: 2494 where_clause = "" 2495 2496 # Variants 2497 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2498 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2499 log.debug(f"sql_query_select={sql_query_select}") 2500 2501 return self.export_output( 2502 output_file=vcf_file, 2503 output_header=None, 2504 export_header=True, 2505 query=sql_query_select, 2506 parquet_partitions=None, 2507 chunk_size=config.get("chunk_size", None), 2508 threads=threads, 2509 sort=True, 2510 index=index, 2511 order_by=None, 2512 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters.
- remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a flag that determines whether to replace the INFO field in the output VCF file. If set to True, the INFO field is replaced by "."; if False, the INFO field is included unchanged. Defaults to False.
- add_samples: The `add_samples` parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added; if False, the samples will be removed. Defaults to True.
- list_samples: The `list_samples` parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file.
- index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to True, the output VCF file will be indexed using tabix. Defaults to False.
- threads: The `threads` parameter in the `export_variant_vcf` function specifies the number of threads to use for exporting the VCF file. More threads can potentially speed up the export process by utilizing multiple processor cores. If not provided, the configured default is used.
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
2514 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2515 """ 2516 It takes a list of commands and runs them in parallel using the number of threads specified 2517 2518 :param commands: A list of commands to run 2519 :param threads: The number of threads to use, defaults to 1 (optional) 2520 """ 2521 2522 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2524 def get_threads(self, default: int = 1) -> int: 2525 """ 2526 This function returns the number of threads to use for a job, with a default value of 1 if not 2527 specified. 2528 2529 :param default: The `default` parameter in the `get_threads` method is used to specify the 2530 default number of threads to use if no specific value is provided. If no value is provided for 2531 the `threads` parameter in the configuration or input parameters, the `default` value will be 2532 used, defaults to 1 2533 :type default: int (optional) 2534 :return: the number of threads to use for the current job. 2535 """ 2536 2537 # Config 2538 config = self.get_config() 2539 2540 # Param 2541 param = self.get_param() 2542 2543 # Input threads 2544 input_thread = param.get("threads", config.get("threads", None)) 2545 2546 # Check threads 2547 if not input_thread: 2548 threads = default 2549 elif int(input_thread) <= 0: 2550 threads = os.cpu_count() 2551 else: 2552 threads = int(input_thread) 2553 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The `default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used. Defaults to 1.
Returns
the number of threads to use for the current job.
2555 def get_memory(self, default: str = None) -> str: 2556 """ 2557 This function retrieves the memory value from parameters or configuration with a default value 2558 if not found. 2559 2560 :param default: The `get_memory` function takes in a default value as a string parameter. This 2561 default value is used as a fallback in case the `memory` parameter is not provided in the 2562 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2563 the function 2564 :type default: str 2565 :return: The `get_memory` function returns a string value representing the memory parameter. If 2566 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2567 return the default value provided as an argument to the function. 2568 """ 2569 2570 # Config 2571 config = self.get_config() 2572 2573 # Param 2574 param = self.get_param() 2575 2576 # Input threads 2577 input_memory = param.get("memory", config.get("memory", None)) 2578 2579 # Check threads 2580 if input_memory: 2581 memory = input_memory 2582 else: 2583 memory = default 2584 2585 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The `get_memory` function takes a default value as a string parameter. This default value is used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary.
Returns
The `get_memory` function returns a string value representing the memory parameter. If `memory` is provided in the parameters or configuration, that value is returned; otherwise, the `default` value provided as an argument is returned.
2587 def update_from_vcf(self, vcf_file: str) -> None: 2588 """ 2589 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2590 2591 :param vcf_file: the path to the VCF file 2592 """ 2593 2594 connexion_format = self.get_connexion_format() 2595 2596 if connexion_format in ["duckdb"]: 2597 self.update_from_vcf_duckdb(vcf_file) 2598 elif connexion_format in ["sqlite"]: 2599 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file into the variants table (duckdb).

        For each variant matching on #CHROM/POS/REF/ALT, the VCF INFO value is
        concatenated to the existing INFO, separated by ';' when both sides
        are non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping its header lines
        skip = self.get_header_length(file=vcf_file)
        # NOTE: vcf_df looks unused, but the SQL below references it by its
        # Python variable name through duckdb's DataFrame replacement scan —
        # do not rename or remove it.
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # For variants with no match in the VCF the subquery yields NULL;
        # presumably duckdb's concat() skips NULLs so their INFO is kept
        # unchanged — TODO confirm against the duckdb version in use.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
                    ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2657 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2658 """ 2659 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2660 table, then updates the INFO column of the variants table with the INFO column of the temporary 2661 table 2662 2663 :param vcf_file: The path to the VCF file you want to update the database with 2664 """ 2665 2666 # Create a temporary table for the VCF 2667 table_vcf = "tmp_vcf" 2668 sql_create = ( 2669 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2670 ) 2671 self.conn.execute(sql_create) 2672 2673 # Loading VCF into temporaire table 2674 vcf_df = pd.read_csv( 2675 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2676 ) 2677 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2678 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2679 2680 # Update table 'variants' with VCF data 2681 # warning: CONCAT as || operator 2682 sql_query_update = f""" 2683 UPDATE variants as table_variants 2684 SET INFO = CASE 2685 WHEN INFO NOT IN ('', '.') 2686 THEN INFO 2687 ELSE '' 2688 END || 2689 ( 2690 SELECT 2691 CASE 2692 WHEN table_variants.INFO NOT IN ('','.') 2693 AND table_vcf.INFO NOT IN ('','.') 2694 THEN ';' 2695 ELSE '' 2696 END || 2697 CASE 2698 WHEN table_vcf.INFO NOT IN ('','.') 2699 THEN table_vcf.INFO 2700 ELSE '' 2701 END 2702 FROM {table_vcf} as table_vcf 2703 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2704 AND table_vcf.\"POS\" = table_variants.\"POS\" 2705 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2706 AND table_vcf.\"REF\" = table_variants.\"REF\" 2707 ) 2708 """ 2709 self.conn.execute(sql_query_update) 2710 2711 # Drop temporary table 2712 sql_drop = f"DROP TABLE {table_vcf}" 2713 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2715 def drop_variants_table(self) -> None: 2716 """ 2717 > This function drops the variants table 2718 """ 2719 2720 table_variants = self.get_table_variants() 2721 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2722 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Create and fill a variant identifier column on the variants table.

        The column is populated with a hash derived from the assembly,
        "#CHROM", "POS", "REF" and "ALT" columns. It is only (re)computed when
        a "variant_id" column is missing, or when ``force`` is set.

        :param variant_id_column: The name of the column to be created in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if
            it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly, from param then config, with the global default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence test uses the literal "variant_id", not
        # variant_id_column — a custom-named column is re-created every call.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the quoted literal
            # '"{prefix}SVTYPE"' (a constant string), not the value of the
            # exploded SVTYPE column — confirm whether a column reference
            # was intended.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2783 def get_variant_id_column( 2784 self, variant_id_column: str = "variant_id", force: bool = None 2785 ) -> str: 2786 """ 2787 This function returns the variant_id column name 2788 2789 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2790 defaults to variant_id 2791 :type variant_id_column: str (optional) 2792 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2793 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2794 if it is not already set, or if it is set 2795 :type force: bool 2796 :return: The variant_id column name. 2797 """ 2798 2799 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2805 def scan_databases( 2806 self, 2807 database_formats: list = ["parquet"], 2808 database_releases: list = ["current"], 2809 ) -> dict: 2810 """ 2811 The function `scan_databases` scans for available databases based on specified formats and 2812 releases. 2813 2814 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2815 of the databases to be scanned. In this case, the accepted format is "parquet" 2816 :type database_formats: list ["parquet"] 2817 :param database_releases: The `database_releases` parameter is a list that specifies the 2818 releases of the databases to be scanned. In the provided function, the default value for 2819 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2820 databases that are in the "current" 2821 :type database_releases: list 2822 :return: The function `scan_databases` returns a dictionary containing information about 2823 databases that match the specified formats and releases. 2824 """ 2825 2826 # Config 2827 config = self.get_config() 2828 2829 # Param 2830 param = self.get_param() 2831 2832 # Param - Assembly 2833 assembly = param.get("assembly", config.get("assembly", None)) 2834 if not assembly: 2835 assembly = DEFAULT_ASSEMBLY 2836 log.warning(f"Default assembly '{assembly}'") 2837 2838 # Scan for availabled databases 2839 log.info( 2840 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2841 ) 2842 databases_infos_dict = databases_infos( 2843 database_folder_releases=database_releases, 2844 database_formats=database_formats, 2845 assembly=assembly, 2846 config=config, 2847 ) 2848 log.info( 2849 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2850 ) 2851 2852 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned. By default, the accepted format is "parquet".
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned. The default value is `["current"]`, meaning that by default the function scans databases in the "current" release.
Returns
The function
scan_databasesreturns a dictionary containing information about databases that match the specified formats and releases.
2854 def annotation(self) -> None: 2855 """ 2856 It annotates the VCF file with the annotations specified in the config file. 2857 """ 2858 2859 # Config 2860 config = self.get_config() 2861 2862 # Param 2863 param = self.get_param() 2864 2865 # Param - Assembly 2866 assembly = param.get("assembly", config.get("assembly", None)) 2867 if not assembly: 2868 assembly = DEFAULT_ASSEMBLY 2869 log.warning(f"Default assembly '{assembly}'") 2870 2871 # annotations databases folders 2872 annotations_databases = set( 2873 config.get("folders", {}) 2874 .get("databases", {}) 2875 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2876 + config.get("folders", {}) 2877 .get("databases", {}) 2878 .get("parquet", ["~/howard/databases/parquet/current"]) 2879 + config.get("folders", {}) 2880 .get("databases", {}) 2881 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2882 ) 2883 2884 # Get param annotations 2885 if param.get("annotations", None) and isinstance( 2886 param.get("annotations", None), str 2887 ): 2888 log.debug(param.get("annotations", None)) 2889 param_annotation_list = param.get("annotations").split(",") 2890 else: 2891 param_annotation_list = [] 2892 2893 # Each tools param 2894 if param.get("annotation_parquet", None) != None: 2895 log.debug( 2896 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2897 ) 2898 if isinstance(param.get("annotation_parquet", None), list): 2899 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2900 else: 2901 param_annotation_list.append(param.get("annotation_parquet")) 2902 if param.get("annotation_snpsift", None) != None: 2903 if isinstance(param.get("annotation_snpsift", None), list): 2904 param_annotation_list.append( 2905 "snpsift:" 2906 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2907 ) 2908 else: 2909 param_annotation_list.append( 2910 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2911 ) 2912 if param.get("annotation_snpeff", None) 
!= None: 2913 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2914 if param.get("annotation_bcftools", None) != None: 2915 if isinstance(param.get("annotation_bcftools", None), list): 2916 param_annotation_list.append( 2917 "bcftools:" 2918 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2919 ) 2920 else: 2921 param_annotation_list.append( 2922 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2923 ) 2924 if param.get("annotation_annovar", None) != None: 2925 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2926 if param.get("annotation_exomiser", None) != None: 2927 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2928 if param.get("annotation_splice", None) != None: 2929 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2930 2931 # Merge param annotations list 2932 param["annotations"] = ",".join(param_annotation_list) 2933 2934 # debug 2935 log.debug(f"param_annotations={param['annotations']}") 2936 2937 if param.get("annotations"): 2938 2939 # Log 2940 # log.info("Annotations - Check annotation parameters") 2941 2942 if not "annotation" in param: 2943 param["annotation"] = {} 2944 2945 # List of annotations parameters 2946 annotations_list_input = {} 2947 if isinstance(param.get("annotations", None), str): 2948 annotation_file_list = [ 2949 value for value in param.get("annotations", "").split(",") 2950 ] 2951 for annotation_file in annotation_file_list: 2952 annotations_list_input[annotation_file] = {"INFO": None} 2953 else: 2954 annotations_list_input = param.get("annotations", {}) 2955 2956 log.info(f"Quick Annotations:") 2957 for annotation_key in list(annotations_list_input.keys()): 2958 log.info(f" {annotation_key}") 2959 2960 # List of annotations and associated fields 2961 annotations_list = {} 2962 2963 for annotation_file in annotations_list_input: 2964 2965 # Explode annotations if ALL 2966 if ( 2967 
annotation_file.upper() == "ALL" 2968 or annotation_file.upper().startswith("ALL:") 2969 ): 2970 2971 # check ALL parameters (formats, releases) 2972 annotation_file_split = annotation_file.split(":") 2973 database_formats = "parquet" 2974 database_releases = "current" 2975 for annotation_file_option in annotation_file_split[1:]: 2976 database_all_options_split = annotation_file_option.split("=") 2977 if database_all_options_split[0] == "format": 2978 database_formats = database_all_options_split[1].split("+") 2979 if database_all_options_split[0] == "release": 2980 database_releases = database_all_options_split[1].split("+") 2981 2982 # Scan for availabled databases 2983 databases_infos_dict = self.scan_databases( 2984 database_formats=database_formats, 2985 database_releases=database_releases, 2986 ) 2987 2988 # Add found databases in annotation parameters 2989 for database_infos in databases_infos_dict.keys(): 2990 annotations_list[database_infos] = {"INFO": None} 2991 2992 else: 2993 annotations_list[annotation_file] = annotations_list_input[ 2994 annotation_file 2995 ] 2996 2997 # Check each databases 2998 if len(annotations_list): 2999 3000 log.info( 3001 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3002 ) 3003 3004 for annotation_file in annotations_list: 3005 3006 # Init 3007 annotations = annotations_list.get(annotation_file, None) 3008 3009 # Annotation snpEff 3010 if annotation_file.startswith("snpeff"): 3011 3012 log.debug(f"Quick Annotation snpEff") 3013 3014 if "snpeff" not in param["annotation"]: 3015 param["annotation"]["snpeff"] = {} 3016 3017 if "options" not in param["annotation"]["snpeff"]: 3018 param["annotation"]["snpeff"]["options"] = "" 3019 3020 # snpEff options in annotations 3021 param["annotation"]["snpeff"]["options"] = "".join( 3022 annotation_file.split(":")[1:] 3023 ) 3024 3025 # Annotation Annovar 3026 elif annotation_file.startswith("annovar"): 3027 3028 log.debug(f"Quick Annotation Annovar") 3029 3030 if "annovar" not in param["annotation"]: 3031 param["annotation"]["annovar"] = {} 3032 3033 if "annotations" not in param["annotation"]["annovar"]: 3034 param["annotation"]["annovar"]["annotations"] = {} 3035 3036 # Options 3037 annotation_file_split = annotation_file.split(":") 3038 for annotation_file_annotation in annotation_file_split[1:]: 3039 if annotation_file_annotation: 3040 param["annotation"]["annovar"]["annotations"][ 3041 annotation_file_annotation 3042 ] = annotations 3043 3044 # Annotation Exomiser 3045 elif annotation_file.startswith("exomiser"): 3046 3047 log.debug(f"Quick Annotation Exomiser") 3048 3049 param["annotation"]["exomiser"] = params_string_to_dict( 3050 annotation_file 3051 ) 3052 3053 # Annotation Splice 3054 elif annotation_file.startswith("splice"): 3055 3056 log.debug(f"Quick Annotation Splice") 3057 3058 param["annotation"]["splice"] = params_string_to_dict( 3059 annotation_file 3060 ) 3061 3062 # Annotation Parquet or BCFTOOLS 3063 else: 3064 3065 # Tools detection 3066 if annotation_file.startswith("bcftools:"): 3067 annotation_tool_initial = "bcftools" 3068 annotation_file = ":".join(annotation_file.split(":")[1:]) 3069 elif annotation_file.startswith("snpsift:"): 3070 annotation_tool_initial = 
"snpsift" 3071 annotation_file = ":".join(annotation_file.split(":")[1:]) 3072 else: 3073 annotation_tool_initial = None 3074 3075 # list of files 3076 annotation_file_list = annotation_file.replace("+", ":").split( 3077 ":" 3078 ) 3079 3080 for annotation_file in annotation_file_list: 3081 3082 if annotation_file: 3083 3084 # Annotation tool initial 3085 annotation_tool = annotation_tool_initial 3086 3087 # Find file 3088 annotation_file_found = None 3089 3090 # Expand user 3091 annotation_file = full_path(annotation_file) 3092 3093 if os.path.exists(annotation_file): 3094 annotation_file_found = annotation_file 3095 3096 else: 3097 # Find within assembly folders 3098 for annotations_database in annotations_databases: 3099 found_files = find_all( 3100 annotation_file, 3101 os.path.join( 3102 annotations_database, assembly 3103 ), 3104 ) 3105 if len(found_files) > 0: 3106 annotation_file_found = found_files[0] 3107 break 3108 if not annotation_file_found and not assembly: 3109 # Find within folders 3110 for ( 3111 annotations_database 3112 ) in annotations_databases: 3113 found_files = find_all( 3114 annotation_file, annotations_database 3115 ) 3116 if len(found_files) > 0: 3117 annotation_file_found = found_files[0] 3118 break 3119 log.debug( 3120 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3121 ) 3122 3123 # Full path 3124 annotation_file_found = full_path(annotation_file_found) 3125 3126 if annotation_file_found: 3127 3128 database = Database(database=annotation_file_found) 3129 quick_annotation_format = database.get_format() 3130 quick_annotation_is_compressed = ( 3131 database.is_compressed() 3132 ) 3133 quick_annotation_is_indexed = os.path.exists( 3134 f"{annotation_file_found}.tbi" 3135 ) 3136 bcftools_preference = False 3137 3138 # Check Annotation Tool 3139 if not annotation_tool: 3140 if ( 3141 bcftools_preference 3142 and quick_annotation_format 3143 in ["vcf", "bed"] 3144 and quick_annotation_is_compressed 3145 and 
quick_annotation_is_indexed 3146 ): 3147 annotation_tool = "bcftools" 3148 elif quick_annotation_format in [ 3149 "vcf", 3150 "bed", 3151 "tsv", 3152 "tsv", 3153 "csv", 3154 "json", 3155 "tbl", 3156 "parquet", 3157 "duckdb", 3158 ]: 3159 annotation_tool = "parquet" 3160 else: 3161 log.error( 3162 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3163 ) 3164 raise ValueError( 3165 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3166 ) 3167 3168 log.debug( 3169 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3170 ) 3171 3172 # Annotation Tool dispatch 3173 if annotation_tool: 3174 if annotation_tool not in param["annotation"]: 3175 param["annotation"][annotation_tool] = {} 3176 if ( 3177 "annotations" 3178 not in param["annotation"][annotation_tool] 3179 ): 3180 param["annotation"][annotation_tool][ 3181 "annotations" 3182 ] = {} 3183 param["annotation"][annotation_tool][ 3184 "annotations" 3185 ][annotation_file_found] = annotations 3186 3187 else: 3188 log.error( 3189 f"Quick Annotation File {annotation_file} does NOT exist" 3190 ) 3191 3192 self.set_param(param) 3193 3194 if param.get("annotation", None): 3195 log.info("Annotations") 3196 if param.get("annotation", {}).get("parquet", None): 3197 log.info("Annotations 'parquet'...") 3198 self.annotation_parquet() 3199 if param.get("annotation", {}).get("bcftools", None): 3200 log.info("Annotations 'bcftools'...") 3201 self.annotation_bcftools() 3202 if param.get("annotation", {}).get("snpsift", None): 3203 log.info("Annotations 'snpsift'...") 3204 self.annotation_snpsift() 3205 if param.get("annotation", {}).get("annovar", None): 3206 log.info("Annotations 'annovar'...") 3207 self.annotation_annovar() 3208 if param.get("annotation", {}).get("snpeff", None): 3209 log.info("Annotations 'snpeff'...") 3210 self.annotation_snpeff() 3211 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3212 log.info("Annotations 'exomiser'...") 3213 self.annotation_exomiser() 3214 if param.get("annotation", {}).get("splice", None) is not None: 3215 log.info("Annotations 'splice' ...") 3216 self.annotation_splice() 3217 3218 # Explode INFOS fields into table fields 3219 if self.get_explode_infos(): 3220 self.explode_infos( 3221 prefix=self.get_explode_infos_prefix(), 3222 fields=self.get_explode_infos_fields(), 3223 force=True, 3224 )
It annotates the VCF file with the annotations specified in the configuration and parameters.
3226 def annotation_snpsift(self, threads: int = None) -> None: 3227 """ 3228 This function annotate with bcftools 3229 3230 :param threads: Number of threads to use 3231 :return: the value of the variable "return_value". 3232 """ 3233 3234 # DEBUG 3235 log.debug("Start annotation with bcftools databases") 3236 3237 # Threads 3238 if not threads: 3239 threads = self.get_threads() 3240 log.debug("Threads: " + str(threads)) 3241 3242 # Config 3243 config = self.get_config() 3244 log.debug("Config: " + str(config)) 3245 3246 # Config - snpSift 3247 snpsift_bin_command = get_bin_command( 3248 bin="SnpSift.jar", 3249 tool="snpsift", 3250 bin_type="jar", 3251 config=config, 3252 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3253 ) 3254 if not snpsift_bin_command: 3255 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3256 log.error(msg_err) 3257 raise ValueError(msg_err) 3258 3259 # Config - bcftools 3260 bcftools_bin_command = get_bin_command( 3261 bin="bcftools", 3262 tool="bcftools", 3263 bin_type="bin", 3264 config=config, 3265 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3266 ) 3267 if not bcftools_bin_command: 3268 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3269 log.error(msg_err) 3270 raise ValueError(msg_err) 3271 3272 # Config - BCFTools databases folders 3273 databases_folders = set( 3274 self.get_config() 3275 .get("folders", {}) 3276 .get("databases", {}) 3277 .get("annotations", ["."]) 3278 + self.get_config() 3279 .get("folders", {}) 3280 .get("databases", {}) 3281 .get("bcftools", ["."]) 3282 ) 3283 log.debug("Databases annotations: " + str(databases_folders)) 3284 3285 # Param 3286 annotations = ( 3287 self.get_param() 3288 .get("annotation", {}) 3289 .get("snpsift", {}) 3290 .get("annotations", None) 3291 ) 3292 log.debug("Annotations: " + str(annotations)) 3293 3294 # Assembly 3295 assembly = self.get_param().get( 3296 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3297 ) 3298 
3299 # Data 3300 table_variants = self.get_table_variants() 3301 3302 # Check if not empty 3303 log.debug("Check if not empty") 3304 sql_query_chromosomes = ( 3305 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3306 ) 3307 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3308 if not sql_query_chromosomes_df["count"][0]: 3309 log.info(f"VCF empty") 3310 return 3311 3312 # VCF header 3313 vcf_reader = self.get_header() 3314 log.debug("Initial header: " + str(vcf_reader.infos)) 3315 3316 # Existing annotations 3317 for vcf_annotation in self.get_header().infos: 3318 3319 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3320 log.debug( 3321 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3322 ) 3323 3324 if annotations: 3325 3326 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3327 3328 # Export VCF file 3329 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3330 3331 # Init 3332 commands = {} 3333 3334 for annotation in annotations: 3335 annotation_fields = annotations[annotation] 3336 3337 # Annotation Name 3338 annotation_name = os.path.basename(annotation) 3339 3340 if not annotation_fields: 3341 annotation_fields = {"INFO": None} 3342 3343 log.debug(f"Annotation '{annotation_name}'") 3344 log.debug( 3345 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3346 ) 3347 3348 # Create Database 3349 database = Database( 3350 database=annotation, 3351 databases_folders=databases_folders, 3352 assembly=assembly, 3353 ) 3354 3355 # Find files 3356 db_file = database.get_database() 3357 db_file = full_path(db_file) 3358 db_hdr_file = database.get_header_file() 3359 db_hdr_file = full_path(db_hdr_file) 3360 db_file_type = database.get_format() 3361 db_tbi_file = f"{db_file}.tbi" 3362 db_file_compressed = database.is_compressed() 3363 3364 # Check if compressed 3365 if not db_file_compressed: 3366 log.error( 3367 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3368 ) 3369 raise ValueError( 3370 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3371 ) 3372 3373 # Check if indexed 3374 if not os.path.exists(db_tbi_file): 3375 log.error( 3376 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3377 ) 3378 raise ValueError( 3379 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3380 ) 3381 3382 # Check index - try to create if not exists 3383 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3384 log.error("Annotation failed: database not valid") 3385 log.error(f"Annotation annotation file: {db_file}") 3386 log.error(f"Annotation annotation header: {db_hdr_file}") 3387 log.error(f"Annotation annotation index: {db_tbi_file}") 3388 raise ValueError( 3389 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3390 ) 3391 else: 3392 3393 log.debug( 3394 f"Annotation '{annotation}' - file: " 3395 + str(db_file) 3396 + " and " 3397 + str(db_hdr_file) 3398 ) 3399 3400 # Load header as VCF object 3401 db_hdr_vcf = Variants(input=db_hdr_file) 3402 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3403 log.debug( 3404 "Annotation database header: " 3405 + str(db_hdr_vcf_header_infos) 3406 ) 3407 3408 # For all fields in database 3409 annotation_fields_full = False 3410 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3411 annotation_fields = { 3412 key: key for key in db_hdr_vcf_header_infos 3413 } 3414 log.debug( 3415 "Annotation database header - All annotations added: " 3416 + str(annotation_fields) 3417 ) 3418 annotation_fields_full = True 3419 3420 # # Create file for field rename 3421 # log.debug("Create file for field rename") 3422 # tmp_rename = NamedTemporaryFile( 3423 # prefix=self.get_prefix(), 3424 # dir=self.get_tmp_dir(), 3425 # suffix=".rename", 3426 # delete=False, 3427 # ) 3428 # tmp_rename_name = tmp_rename.name 
3429 # tmp_files.append(tmp_rename_name) 3430 3431 # Number of fields 3432 nb_annotation_field = 0 3433 annotation_list = [] 3434 annotation_infos_rename_list = [] 3435 3436 for annotation_field in annotation_fields: 3437 3438 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3439 annotation_fields_new_name = annotation_fields.get( 3440 annotation_field, annotation_field 3441 ) 3442 if not annotation_fields_new_name: 3443 annotation_fields_new_name = annotation_field 3444 3445 # Check if field is in DB and if field is not elready in input data 3446 if ( 3447 annotation_field in db_hdr_vcf.get_header().infos 3448 and annotation_fields_new_name 3449 not in self.get_header().infos 3450 ): 3451 3452 log.info( 3453 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3454 ) 3455 3456 # BCFTools annotate param to rename fields 3457 if annotation_field != annotation_fields_new_name: 3458 annotation_infos_rename_list.append( 3459 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3460 ) 3461 3462 # Add INFO field to header 3463 db_hdr_vcf_header_infos_number = ( 3464 db_hdr_vcf_header_infos[annotation_field].num or "." 
3465 ) 3466 db_hdr_vcf_header_infos_type = ( 3467 db_hdr_vcf_header_infos[annotation_field].type 3468 or "String" 3469 ) 3470 db_hdr_vcf_header_infos_description = ( 3471 db_hdr_vcf_header_infos[annotation_field].desc 3472 or f"{annotation_field} description" 3473 ) 3474 db_hdr_vcf_header_infos_source = ( 3475 db_hdr_vcf_header_infos[annotation_field].source 3476 or "unknown" 3477 ) 3478 db_hdr_vcf_header_infos_version = ( 3479 db_hdr_vcf_header_infos[annotation_field].version 3480 or "unknown" 3481 ) 3482 3483 vcf_reader.infos[annotation_fields_new_name] = ( 3484 vcf.parser._Info( 3485 annotation_fields_new_name, 3486 db_hdr_vcf_header_infos_number, 3487 db_hdr_vcf_header_infos_type, 3488 db_hdr_vcf_header_infos_description, 3489 db_hdr_vcf_header_infos_source, 3490 db_hdr_vcf_header_infos_version, 3491 self.code_type_map[ 3492 db_hdr_vcf_header_infos_type 3493 ], 3494 ) 3495 ) 3496 3497 annotation_list.append(annotation_field) 3498 3499 nb_annotation_field += 1 3500 3501 else: 3502 3503 if ( 3504 annotation_field 3505 not in db_hdr_vcf.get_header().infos 3506 ): 3507 log.warning( 3508 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3509 ) 3510 if ( 3511 annotation_fields_new_name 3512 in self.get_header().infos 3513 ): 3514 log.warning( 3515 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3516 ) 3517 3518 log.info( 3519 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3520 ) 3521 3522 annotation_infos = ",".join(annotation_list) 3523 3524 if annotation_infos != "": 3525 3526 # Annotated VCF (and error file) 3527 tmp_annotation_vcf_name = os.path.join( 3528 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3529 ) 3530 tmp_annotation_vcf_name_err = ( 3531 tmp_annotation_vcf_name + ".err" 3532 ) 3533 3534 # Add fields to annotate 3535 if not annotation_fields_full: 3536 annotation_infos_option = f"-info {annotation_infos}" 3537 else: 
3538 annotation_infos_option = "" 3539 3540 # Info fields rename 3541 if annotation_infos_rename_list: 3542 annotation_infos_rename = " -c " + ",".join( 3543 annotation_infos_rename_list 3544 ) 3545 else: 3546 annotation_infos_rename = "" 3547 3548 # Annotate command 3549 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3550 3551 # Add command 3552 commands[command_annotate] = tmp_annotation_vcf_name 3553 3554 if commands: 3555 3556 # Export VCF file 3557 self.export_variant_vcf( 3558 vcf_file=tmp_vcf_name, 3559 remove_info=True, 3560 add_samples=False, 3561 index=True, 3562 ) 3563 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3564 3565 # Num command 3566 nb_command = 0 3567 3568 # Annotate 3569 for command_annotate in commands: 3570 nb_command += 1 3571 log.info( 3572 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3573 ) 3574 log.debug(f"command_annotate={command_annotate}") 3575 run_parallel_commands([command_annotate], threads) 3576 3577 # Debug 3578 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3579 3580 # Update variants 3581 log.info( 3582 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3583 ) 3584 self.update_from_vcf(commands[command_annotate])
This function annotates with SnpSift.
Parameters
- threads: Number of threads to use
Returns
None.
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants with `bcftools annotate`, using VCF/BED databases.

        Each configured database is checked (compressed, tabix-indexed, header
        available). For each database and each chromosome present in the variants
        table, a BED of merged +/-1Mb windows around the variants is built and a
        `bcftools annotate` command restricted to those regions is prepared. All
        commands are run in parallel, the per-region outputs are merged with
        `bcftools merge`, stderr files are scanned for warnings/errors, and the
        merged VCF is loaded back into the variants table.

        :param threads: Number of threads to use (defaults to `self.get_threads()`)
        :raises ValueError: if the bcftools binary is not available, if a database
            is missing, not compressed or not indexed, or if any annotate/merge
            command wrote "[E::" errors to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is only computed and logged here; temp files are
        # actually removed by the "rm -f" appended to the merge command below
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF (the file is written later by export_variant_vcf; only the
        # name is reserved here)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Per-run accumulators: annotated outputs, shell commands, temp and err files
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (defaults for missing metadata)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" column: rename syntax when names differ
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/-1Mb window around each variant,
                            # clamped at 0, then overlapping windows merged
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to the BED regions, then tabix-index
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (appended to the merge command)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan all stderr files for bcftools/tabix
                    # "[W::" warnings and "[E::" errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f" {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f" {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools.

Parameters:
- threads: number of threads to use

Returns:
- the function's return value ("return_value")
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True when annotation completed; False if the VCF is empty or has no samples
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged here; download below may create it — confirm
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (jar) command; fail fast if not resolvable
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): computed but not referenced in exomiser_command_analysis below — confirm intent
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep digits only, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param)
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Splitted analysis dict
                # NOTE(review): .copy() is shallow — the split dict shares nested objects with the full dict
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used — command is built in exomiser_command_analysis below
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample) -> single full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Exomiser TSV columns as INFO fields option
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: String by default; Float for fully-numeric
                            # object columns; Integer for non-object dtypes
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name into a valid INFO key
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query
                    # NOTE(review): join assumes variants' #CHROM carries a 'chr' prefix
                    # while Exomiser CONTIG does not — confirm for non-'chr' references
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser.

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" }. Default: None
- "sample" (string):
Sample name to construct the "subject" section:
"subject":
{
"id": "<sample>",
"sex": "UNKNOWN_SEX"
}
Default: None
- "phenotypicFeatures" (dict): phenotypic features to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list): list of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055']. Default: []
- "outputOptions" (dict): output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): transcript source (either "refseq", "ucsc", "ensembl"). Default: "refseq"
- "exomiser_to_info" (boolean): add Exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by the application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample is given in parameters, the first sample in the VCF will be chosen
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
4846 def annotation_snpeff(self, threads: int = None) -> None: 4847 """ 4848 This function annotate with snpEff 4849 4850 :param threads: The number of threads to use 4851 :return: the value of the variable "return_value". 4852 """ 4853 4854 # DEBUG 4855 log.debug("Start annotation with snpeff databases") 4856 4857 # Threads 4858 if not threads: 4859 threads = self.get_threads() 4860 log.debug("Threads: " + str(threads)) 4861 4862 # DEBUG 4863 delete_tmp = True 4864 if self.get_config().get("verbosity", "warning") in ["debug"]: 4865 delete_tmp = False 4866 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4867 4868 # Config 4869 config = self.get_config() 4870 log.debug("Config: " + str(config)) 4871 4872 # Config - Folders - Databases 4873 databases_folders = ( 4874 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4875 ) 4876 log.debug("Databases annotations: " + str(databases_folders)) 4877 4878 # # Config - Java 4879 # java_bin = get_bin( 4880 # tool="java", 4881 # bin="java", 4882 # bin_type="bin", 4883 # config=config, 4884 # default_folder="/usr/bin", 4885 # ) 4886 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4887 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4888 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4889 4890 # # Config - snpEff bin 4891 # snpeff_jar = get_bin( 4892 # tool="snpeff", 4893 # bin="snpEff.jar", 4894 # bin_type="jar", 4895 # config=config, 4896 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4897 # ) 4898 # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))): 4899 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4900 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4901 4902 # Config - snpEff bin command 4903 snpeff_bin_command = get_bin_command( 4904 bin="snpEff.jar", 4905 tool="snpeff", 4906 bin_type="jar", 4907 config=config, 4908 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4909 ) 
4910 if not snpeff_bin_command: 4911 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4912 log.error(msg_err) 4913 raise ValueError(msg_err) 4914 4915 # Config - snpEff databases 4916 snpeff_databases = ( 4917 config.get("folders", {}) 4918 .get("databases", {}) 4919 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4920 ) 4921 snpeff_databases = full_path(snpeff_databases) 4922 if snpeff_databases is not None and snpeff_databases != "": 4923 log.debug(f"Create snpEff databases folder") 4924 if not os.path.exists(snpeff_databases): 4925 os.makedirs(snpeff_databases) 4926 4927 # Param 4928 param = self.get_param() 4929 log.debug("Param: " + str(param)) 4930 4931 # Param 4932 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4933 log.debug("Options: " + str(options)) 4934 4935 # Param - Assembly 4936 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4937 4938 # Param - Options 4939 snpeff_options = ( 4940 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4941 ) 4942 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4943 snpeff_csvstats = ( 4944 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4945 ) 4946 if snpeff_stats: 4947 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4948 snpeff_stats = full_path(snpeff_stats) 4949 snpeff_options += f" -stats {snpeff_stats}" 4950 if snpeff_csvstats: 4951 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4952 snpeff_csvstats = full_path(snpeff_csvstats) 4953 snpeff_options += f" -csvStats {snpeff_csvstats}" 4954 4955 # Data 4956 table_variants = self.get_table_variants() 4957 4958 # Check if not empty 4959 log.debug("Check if not empty") 4960 sql_query_chromosomes = ( 4961 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4962 ) 4963 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4964 if not 
self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4965 log.info(f"VCF empty") 4966 return 4967 4968 # Export in VCF 4969 log.debug("Create initial file to annotate") 4970 tmp_vcf = NamedTemporaryFile( 4971 prefix=self.get_prefix(), 4972 dir=self.get_tmp_dir(), 4973 suffix=".vcf.gz", 4974 delete=True, 4975 ) 4976 tmp_vcf_name = tmp_vcf.name 4977 4978 # VCF header 4979 vcf_reader = self.get_header() 4980 log.debug("Initial header: " + str(vcf_reader.infos)) 4981 4982 # Existing annotations 4983 for vcf_annotation in self.get_header().infos: 4984 4985 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4986 log.debug( 4987 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4988 ) 4989 4990 # Memory limit 4991 # if config.get("memory", None): 4992 # memory_limit = config.get("memory", "8G") 4993 # else: 4994 # memory_limit = "8G" 4995 memory_limit = self.get_memory("8G") 4996 log.debug(f"memory_limit: {memory_limit}") 4997 4998 # snpEff java options 4999 snpeff_java_options = ( 5000 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5001 ) 5002 log.debug(f"Exomiser java options: {snpeff_java_options}") 5003 5004 force_update_annotation = True 5005 5006 if "ANN" not in self.get_header().infos or force_update_annotation: 5007 5008 # Check snpEff database 5009 log.debug(f"Check snpEff databases {[assembly]}") 5010 databases_download_snpeff( 5011 folder=snpeff_databases, assemblies=[assembly], config=config 5012 ) 5013 5014 # Export VCF file 5015 self.export_variant_vcf( 5016 vcf_file=tmp_vcf_name, 5017 remove_info=True, 5018 add_samples=False, 5019 index=True, 5020 ) 5021 5022 # Tmp file 5023 err_files = [] 5024 tmp_annotate_vcf = NamedTemporaryFile( 5025 prefix=self.get_prefix(), 5026 dir=self.get_tmp_dir(), 5027 suffix=".vcf", 5028 delete=False, 5029 ) 5030 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5031 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5032 
err_files.append(tmp_annotate_vcf_name_err) 5033 5034 # Command 5035 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5036 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5037 run_parallel_commands([snpeff_command], 1) 5038 5039 # Error messages 5040 log.info(f"Error/Warning messages:") 5041 error_message_command_all = [] 5042 error_message_command_warning = [] 5043 error_message_command_err = [] 5044 for err_file in err_files: 5045 with open(err_file, "r") as f: 5046 for line in f: 5047 message = line.strip() 5048 error_message_command_all.append(message) 5049 if line.startswith("[W::"): 5050 error_message_command_warning.append(message) 5051 if line.startswith("[E::"): 5052 error_message_command_err.append(f"{err_file}: " + message) 5053 # log info 5054 for message in list( 5055 set(error_message_command_err + error_message_command_warning) 5056 ): 5057 log.info(f" {message}") 5058 # debug info 5059 for message in list(set(error_message_command_all)): 5060 log.debug(f" {message}") 5061 # failed 5062 if len(error_message_command_err): 5063 log.error("Annotation failed: Error in commands") 5064 raise ValueError("Annotation failed: Error in commands") 5065 5066 # Find annotation in header 5067 with open(tmp_annotate_vcf_name, "rt") as f: 5068 header_list = self.read_vcf_header(f) 5069 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5070 5071 for ann in annovar_vcf_header.infos: 5072 if ann not in self.get_header().infos: 5073 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5074 5075 # Update variants 5076 log.info(f"Annotation - Updating...") 5077 self.update_from_vcf(tmp_annotate_vcf_name) 5078 5079 else: 5080 if "ANN" in self.get_header().infos: 5081 log.debug(f"Existing snpEff annotations in VCF") 5082 if force_update_annotation: 5083 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates the loaded variants with snpEff.

Parameters:
- threads: the number of threads to use.

Returns:
None; the variants table is updated in place.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar databases.

        For each configured Annovar database, exports the variants to a temporary
        VCF, runs `table_annovar.pl` on it, post-processes the output through a
        bcftools/sed/awk pipeline (clean ANNOVAR_DATE tags, unescape special
        characters, drop empty "." fields, keep/rename only the requested INFO
        fields), merges all per-database annotated VCFs with `bcftools merge`,
        and updates the variants table and in-memory header from the merged file.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None. Returns early (without annotating) if the variants table is empty.
        :raises ValueError: if the annovar or bcftools command cannot be resolved,
            or if a command writes error lines to its stderr file.
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected for final cleanup)
        tmp_files = []
        err_files = []

        # DEBUG - keep tmp files around in debug mode
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper around table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: {database_name: {field: new_name, ...}, ...}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO reduced to "." — Annovar regenerates annotations)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (and one annotated VCF) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl names its output "<prefix>.<assembly>_multianno.vcf"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields selected for this database (original and renamed forms)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("old new" pairs, one per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based, "g" gene-based, "r" region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is handled via --argument above)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar: annotate, then move the multianno output to a
                # predictable .tmp.vcf path for the pipe below
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters: unescape "\x3b" back to "," (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".") from the INFO column ($8)
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    for ann in annotation_list:
                        # "^INFO/x" means "keep x" for bcftools annotate -x
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr for htslib- and Annovar-style prefixes
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: merge original VCF with each per-database annotated VCF
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and register new INFO fields
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup always runs (`if True:`) regardless of delete_tmp,
            # so tmp files are removed even in debug mode — confirm intent
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
This function takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.

Parameters:
- threads: the number of threads to use.

Returns:
None; the variants table is updated in place.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with parquet (or parquet-like) annotation databases.

        For each configured database, loads its header to discover available INFO
        fields, then builds and executes per-chromosome DuckDB UPDATE queries that
        concatenate the selected annotations into the variants table's INFO column.
        Supports "variants" databases (joined on #CHROM/POS/REF/ALT) and "regions"
        databases (joined on positional overlap and aggregated per position), plus
        update/append modes that replace or fill-in existing INFO fields.

        :param threads: number of threads to use for the annotation
            (defaults to `self.get_threads()`); currently only logged — the
            parallelism is handled by the DuckDB connection itself.
        :return: None. Returns early (without annotating) if the variants table is empty.
        :raises ValueError: if a database file or its header file cannot be found.
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG - keep tmp files around in debug mode
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: {database: {field: new_name, ...}, ...}; "ALL"/"INFO" select every field
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: "update" replaces existing INFO fields,
        # "append" only fills fields that are empty ('' or '.')
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total count used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (temporary columns to drop at the end)
        added_columns = []

        # drop indexes (they would be invalidated by the INFO updates)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan available databases and add each one
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a selector, not an actual database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (empty means: take all INFO)
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object (resolves files within databases_folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (ATTACH if the database requires it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields, registering a
                    # generic String INFO definition for each unknown extra column
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database: expand "ALL"/"INFO" to every header field
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments for SET INFO)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for "regions" databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column name)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: the field exists in the database header and is
                        # either new, or update/append mode is enabled
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (so the update replaces it)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (";" from the second field on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (defaulting missing metadata)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate when the existing value is empty
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column: extract the value with a regexp
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                            THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                            ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column: cast and sanitize ";" to ","
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                            THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                            ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # (shortcut: copy the database's whole INFO column in one go)
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # start from the field-removal queries so they run first
                        query_dict = query_dict_remove

                        # One UPDATE query per chromosome
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: aggregate all
                            # overlapping regions per position before the join
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                                )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact variant match
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append the new annotations to INFO,
                            # inserting a ";" separator only when both sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                    AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                THEN ';'
                                                ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x: the generated concat() can be
                        # deeply nested, beyond DuckDB's default expression depth
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB UPDATE returns the affected row count as "Count"
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
the result of the annotation operation.
6043 def annotation_splice(self, threads: int = None) -> None: 6044 """ 6045 This function annotate with snpEff 6046 6047 :param threads: The number of threads to use 6048 :return: the value of the variable "return_value". 6049 """ 6050 6051 # DEBUG 6052 log.debug("Start annotation with splice tools") 6053 6054 # Threads 6055 if not threads: 6056 threads = self.get_threads() 6057 log.debug("Threads: " + str(threads)) 6058 6059 # DEBUG 6060 delete_tmp = True 6061 if self.get_config().get("verbosity", "warning") in ["debug"]: 6062 delete_tmp = False 6063 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6064 6065 # Config 6066 config = self.get_config() 6067 log.debug("Config: " + str(config)) 6068 splice_config = config.get("tools", {}).get("splice", {}) 6069 if not splice_config: 6070 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6071 if not splice_config: 6072 msg_err = "No Splice tool config" 6073 log.error(msg_err) 6074 raise ValueError(msg_err) 6075 log.debug(f"splice_config={splice_config}") 6076 6077 # Config - Folders - Databases 6078 databases_folders = ( 6079 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6080 ) 6081 log.debug("Databases annotations: " + str(databases_folders)) 6082 6083 # Splice docker image 6084 splice_docker_image = splice_config.get("docker").get("image") 6085 6086 # Pull splice image if it's not already there 6087 if not check_docker_image_exists(splice_docker_image): 6088 log.warning( 6089 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6090 ) 6091 try: 6092 command(f"docker pull {splice_config.get('docker').get('image')}") 6093 except subprocess.CalledProcessError: 6094 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6095 log.error(msg_err) 6096 raise ValueError(msg_err) 6097 return None 6098 6099 # Config - splice databases 6100 splice_databases = ( 6101 config.get("folders", {}) 6102 .get("databases", {}) 6103 
.get("splice", DEFAULT_SPLICE_FOLDER) 6104 ) 6105 splice_databases = full_path(splice_databases) 6106 6107 # Param 6108 param = self.get_param() 6109 log.debug("Param: " + str(param)) 6110 6111 # Param 6112 options = param.get("annotation", {}).get("splice", {}) 6113 log.debug("Options: " + str(options)) 6114 6115 # Data 6116 table_variants = self.get_table_variants() 6117 6118 # Check if not empty 6119 log.debug("Check if not empty") 6120 sql_query_chromosomes = ( 6121 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6122 ) 6123 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6124 log.info("VCF empty") 6125 return None 6126 6127 # Export in VCF 6128 log.debug("Create initial file to annotate") 6129 6130 # Create output folder 6131 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6132 if not os.path.exists(output_folder): 6133 Path(output_folder).mkdir(parents=True, exist_ok=True) 6134 6135 # Create tmp VCF file 6136 tmp_vcf = NamedTemporaryFile( 6137 prefix=self.get_prefix(), 6138 dir=output_folder, 6139 suffix=".vcf", 6140 delete=False, 6141 ) 6142 tmp_vcf_name = tmp_vcf.name 6143 6144 # VCF header 6145 header = self.get_header() 6146 6147 # Existing annotations 6148 for vcf_annotation in self.get_header().infos: 6149 6150 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6151 log.debug( 6152 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6153 ) 6154 6155 # Memory limit 6156 if config.get("memory", None): 6157 memory_limit = config.get("memory", "8G").upper() 6158 # upper() 6159 else: 6160 memory_limit = "8G" 6161 log.debug(f"memory_limit: {memory_limit}") 6162 6163 # Check number of variants to annotate 6164 where_clause_regex_spliceai = r"SpliceAI_\w+" 6165 where_clause_regex_spip = r"SPiP_\w+" 6166 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6167 
df_list_of_variants_to_annotate = self.get_query_to_df( 6168 query=f""" SELECT * FROM variants {where_clause} """ 6169 ) 6170 if len(df_list_of_variants_to_annotate) == 0: 6171 log.warning( 6172 f"No variants to annotate with splice. Variants probably already annotated with splice" 6173 ) 6174 return None 6175 else: 6176 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6177 6178 # Export VCF file 6179 self.export_variant_vcf( 6180 vcf_file=tmp_vcf_name, 6181 remove_info=True, 6182 add_samples=True, 6183 index=False, 6184 where_clause=where_clause, 6185 ) 6186 6187 # Create docker container and launch splice analysis 6188 if splice_config: 6189 6190 # Splice mount folders 6191 mount_folders = splice_config.get("mount", {}) 6192 6193 # Genome mount 6194 mount_folders[ 6195 config.get("folders", {}) 6196 .get("databases", {}) 6197 .get("genomes", DEFAULT_GENOME_FOLDER) 6198 ] = "ro" 6199 6200 # SpliceAI mount 6201 mount_folders[ 6202 config.get("folders", {}) 6203 .get("databases", {}) 6204 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6205 ] = "ro" 6206 6207 # Genome mount 6208 mount_folders[ 6209 config.get("folders", {}) 6210 .get("databases", {}) 6211 .get("spip", DEFAULT_SPIP_FOLDER) 6212 ] = "ro" 6213 6214 # Mount folders 6215 mount = [] 6216 6217 # Config mount 6218 mount = [ 6219 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6220 for path, mode in mount_folders.items() 6221 ] 6222 6223 if any(value for value in splice_config.values() if value is None): 6224 log.warning("At least one splice config parameter is empty") 6225 return None 6226 6227 # Params in splice nf 6228 def check_values(dico: dict): 6229 """ 6230 Ensure parameters for NF splice pipeline 6231 """ 6232 for key, val in dico.items(): 6233 if key == "genome": 6234 if any( 6235 assemb in options.get("genome", {}) 6236 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6237 ): 6238 yield f"--{key} hg19" 6239 elif any( 6240 assemb in options.get("genome", {}) 6241 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6242 ): 6243 yield f"--{key} hg38" 6244 elif ( 6245 (isinstance(val, str) and val) 6246 or isinstance(val, int) 6247 or isinstance(val, bool) 6248 ): 6249 yield f"--{key} {val}" 6250 6251 # Genome 6252 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6253 options["genome"] = genome 6254 6255 # NF params 6256 nf_params = [] 6257 6258 # Add options 6259 if options: 6260 nf_params = list(check_values(options)) 6261 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6262 else: 6263 log.debug("No NF params provided") 6264 6265 # Add threads 6266 if "threads" not in options.keys(): 6267 nf_params.append(f"--threads {threads}") 6268 6269 # Genome path 6270 genome_path = find_genome( 6271 config.get("folders", {}) 6272 .get("databases", {}) 6273 .get("genomes", DEFAULT_GENOME_FOLDER), 6274 file=f"{genome}.fa", 6275 ) 6276 # Add genome path 6277 if not genome_path: 6278 raise ValueError( 6279 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6280 ) 6281 else: 6282 log.debug(f"Genome: {genome_path}") 6283 nf_params.append(f"--genome_path {genome_path}") 6284 6285 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6286 """ 6287 Setting up updated databases for SPiP and SpliceAI 6288 """ 6289 6290 try: 6291 6292 # SpliceAI assembly transcriptome 6293 spliceai_assembly = os.path.join( 6294 config.get("folders", {}) 6295 .get("databases", {}) 6296 .get("spliceai", {}), 6297 options.get("genome"), 6298 "transcriptome", 6299 ) 6300 spip_assembly = options.get("genome") 6301 6302 spip = find( 6303 f"transcriptome_{spip_assembly}.RData", 6304 config.get("folders", {}).get("databases", {}).get("spip", {}), 6305 ) 6306 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6307 log.debug(f"SPiP annotations: {spip}") 6308 log.debug(f"SpliceAI annotations: {spliceai}") 6309 if spip and spliceai: 6310 return [ 6311 
f"--spip_transcriptome {spip}", 6312 f"--spliceai_annotations {spliceai}", 6313 ] 6314 else: 6315 # TODO crash and go on with basic annotations ? 6316 # raise ValueError( 6317 # "Can't find splice databases in configuration EXIT" 6318 # ) 6319 log.warning( 6320 "Can't find splice databases in configuration, use annotations file from image" 6321 ) 6322 except TypeError: 6323 log.warning( 6324 "Can't find splice databases in configuration, use annotations file from image" 6325 ) 6326 return [] 6327 6328 # Add options, check if transcriptome option have already beend provided 6329 if ( 6330 "spip_transcriptome" not in nf_params 6331 and "spliceai_transcriptome" not in nf_params 6332 ): 6333 splice_reference = splice_annotations(options, config) 6334 if splice_reference: 6335 nf_params.extend(splice_reference) 6336 6337 nf_params.append(f"--output_folder {output_folder}") 6338 6339 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6340 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6341 log.debug(cmd) 6342 6343 splice_config["docker"]["command"] = cmd 6344 6345 docker_cmd = get_bin_command( 6346 tool="splice", 6347 bin_type="docker", 6348 config=config, 6349 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6350 add_options=f"--name {random_uuid} {' '.join(mount)}", 6351 ) 6352 6353 # Docker debug 6354 # if splice_config.get("rm_container"): 6355 # rm_container = "--rm" 6356 # else: 6357 # rm_container = "" 6358 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6359 6360 log.debug(docker_cmd) 6361 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6362 log.debug(res.stdout) 6363 if 
res.stderr: 6364 log.error(res.stderr) 6365 res.check_returncode() 6366 else: 6367 log.warning(f"Splice tool configuration not found: {config}") 6368 6369 # Update variants 6370 log.info("Annotation - Updating...") 6371 # Test find output vcf 6372 log.debug( 6373 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6374 ) 6375 output_vcf = [] 6376 # Wrong folder to look in 6377 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6378 if ( 6379 files 6380 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6381 ): 6382 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6383 # log.debug(os.listdir(options.get("output_folder"))) 6384 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6385 if not output_vcf: 6386 log.debug( 6387 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6388 ) 6389 else: 6390 # Get new header from annotated vcf 6391 log.debug(f"Initial header: {len(header.infos)} fields") 6392 # Create new header with splice infos 6393 new_vcf = Variants(input=output_vcf[0]) 6394 new_vcf_header = new_vcf.get_header().infos 6395 for keys, infos in new_vcf_header.items(): 6396 if keys not in header.infos.keys(): 6397 header.infos[keys] = infos 6398 log.debug(f"New header: {len(header.infos)} fields") 6399 log.debug(f"Splice tmp output: {output_vcf[0]}") 6400 self.update_from_vcf(output_vcf[0]) 6401 6402 # Remove folder 6403 remove_if_exists(output_folder)
This function annotates variants with splice prediction tools (SPiP, SpliceAI)
Parameters
- threads: The number of threads to use
Returns
None.
6409 def get_config_default(self, name: str) -> dict: 6410 """ 6411 The function `get_config_default` returns a dictionary containing default configurations for 6412 various calculations and prioritizations. 6413 6414 :param name: The `get_config_default` function returns a dictionary containing default 6415 configurations for different calculations and prioritizations. The `name` parameter is used to 6416 specify which specific configuration to retrieve from the dictionary 6417 :type name: str 6418 :return: The function `get_config_default` returns a dictionary containing default configuration 6419 settings for different calculations and prioritizations. The specific configuration settings are 6420 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6421 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6422 returned. If there is no match, an empty dictionary is returned. 6423 """ 6424 6425 config_default = { 6426 "calculations": { 6427 "variant_chr_pos_alt_ref": { 6428 "type": "sql", 6429 "name": "variant_chr_pos_alt_ref", 6430 "description": "Create a variant ID with chromosome, position, alt and ref", 6431 "available": False, 6432 "output_column_name": "variant_chr_pos_alt_ref", 6433 "output_column_type": "String", 6434 "output_column_description": "variant ID with chromosome, position, alt and ref", 6435 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6436 "operation_info": True, 6437 }, 6438 "VARTYPE": { 6439 "type": "sql", 6440 "name": "VARTYPE", 6441 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6442 "available": True, 6443 "output_column_name": "VARTYPE", 6444 "output_column_type": "String", 6445 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6446 "operation_query": """ 6447 CASE 6448 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6449 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6450 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6451 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6452 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6453 ELSE 'UNDEFINED' 6454 END 6455 """, 6456 "info_fields": ["SVTYPE"], 6457 "operation_info": True, 6458 }, 6459 "snpeff_hgvs": { 6460 "type": "python", 6461 "name": "snpeff_hgvs", 6462 "description": "HGVS nomenclatures from snpEff annotation", 6463 "available": True, 6464 "function_name": "calculation_extract_snpeff_hgvs", 6465 "function_params": ["snpeff_hgvs", "ANN"], 6466 }, 6467 "snpeff_ann_explode": { 6468 "type": "python", 6469 "name": "snpeff_ann_explode", 6470 "description": "Explode snpEff annotations with uniquify values", 6471 "available": True, 6472 "function_name": "calculation_snpeff_ann_explode", 6473 "function_params": [False, "fields", "snpeff_", "ANN"], 6474 }, 6475 "snpeff_ann_explode_uniquify": { 6476 "type": "python", 6477 "name": "snpeff_ann_explode_uniquify", 6478 "description": "Explode snpEff annotations", 6479 "available": True, 6480 "function_name": "calculation_snpeff_ann_explode", 6481 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6482 }, 6483 "snpeff_ann_explode_json": { 6484 "type": "python", 6485 "name": "snpeff_ann_explode_json", 6486 "description": "Explode snpEff annotations in JSON format", 6487 "available": True, 6488 "function_name": "calculation_snpeff_ann_explode", 6489 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6490 }, 6491 "NOMEN": { 6492 "type": "python", 6493 "name": "NOMEN", 6494 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6495 "available": True, 6496 "function_name": "calculation_extract_nomen", 6497 "function_params": [], 6498 }, 6499 "FINDBYPIPELINE": { 6500 "type": "python", 6501 "name": "FINDBYPIPELINE", 6502 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6503 "available": True, 6504 "function_name": "calculation_find_by_pipeline", 6505 "function_params": ["findbypipeline"], 6506 }, 6507 "FINDBYSAMPLE": { 6508 "type": "python", 6509 "name": "FINDBYSAMPLE", 6510 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6511 "available": True, 6512 "function_name": "calculation_find_by_pipeline", 6513 "function_params": ["findbysample"], 6514 }, 6515 "GENOTYPECONCORDANCE": { 6516 "type": "python", 6517 "name": "GENOTYPECONCORDANCE", 6518 "description": "Concordance of genotype for multi caller VCF", 6519 "available": True, 6520 "function_name": "calculation_genotype_concordance", 6521 "function_params": [], 6522 }, 6523 "BARCODE": { 6524 "type": "python", 6525 "name": "BARCODE", 6526 "description": "BARCODE as VaRank tool", 6527 "available": True, 6528 "function_name": "calculation_barcode", 6529 "function_params": [], 6530 }, 6531 "BARCODEFAMILY": { 6532 "type": "python", 6533 "name": "BARCODEFAMILY", 6534 "description": "BARCODEFAMILY as VaRank tool", 6535 "available": True, 6536 "function_name": "calculation_barcode_family", 6537 "function_params": ["BCF"], 6538 }, 6539 "TRIO": { 6540 "type": "python", 6541 "name": "TRIO", 6542 "description": "Inheritance for a trio family", 6543 "available": True, 6544 "function_name": "calculation_trio", 6545 "function_params": [], 6546 }, 6547 "VAF": { 6548 "type": "python", 6549 "name": "VAF", 6550 "description": "Variant Allele Frequency (VAF) harmonization", 6551 "available": True, 6552 "function_name": "calculation_vaf_normalization", 6553 "function_params": [], 6554 }, 6555 "VAF_stats": { 6556 "type": "python", 6557 "name": 
"VAF_stats", 6558 "description": "Variant Allele Frequency (VAF) statistics", 6559 "available": True, 6560 "function_name": "calculation_genotype_stats", 6561 "function_params": ["VAF"], 6562 }, 6563 "DP_stats": { 6564 "type": "python", 6565 "name": "DP_stats", 6566 "description": "Depth (DP) statistics", 6567 "available": True, 6568 "function_name": "calculation_genotype_stats", 6569 "function_params": ["DP"], 6570 }, 6571 "variant_id": { 6572 "type": "python", 6573 "name": "variant_id", 6574 "description": "Variant ID generated from variant position and type", 6575 "available": True, 6576 "function_name": "calculation_variant_id", 6577 "function_params": [], 6578 }, 6579 "transcripts_json": { 6580 "type": "python", 6581 "name": "transcripts_json", 6582 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6583 "available": True, 6584 "function_name": "calculation_transcripts_annotation", 6585 "function_params": ["transcripts_json", None], 6586 }, 6587 "transcripts_ann": { 6588 "type": "python", 6589 "name": "transcripts_ann", 6590 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6591 "available": True, 6592 "function_name": "calculation_transcripts_annotation", 6593 "function_params": [None, "transcripts_ann"], 6594 }, 6595 "transcripts_annotations": { 6596 "type": "python", 6597 "name": "transcripts_annotations", 6598 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6599 "available": True, 6600 "function_name": "calculation_transcripts_annotation", 6601 "function_params": [None, None], 6602 }, 6603 "transcripts_prioritization": { 6604 "type": "python", 6605 "name": "transcripts_prioritization", 6606 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6607 "available": True, 6608 "function_name": "calculation_transcripts_prioritization", 6609 "function_params": [], 6610 }, 6611 }, 6612 
"prioritizations": { 6613 "default": { 6614 "ANN2": [ 6615 { 6616 "type": "contains", 6617 "value": "HIGH", 6618 "score": 5, 6619 "flag": "PASS", 6620 "comment": [ 6621 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6622 ], 6623 }, 6624 { 6625 "type": "contains", 6626 "value": "MODERATE", 6627 "score": 3, 6628 "flag": "PASS", 6629 "comment": [ 6630 "A non-disruptive variant that might change protein effectiveness" 6631 ], 6632 }, 6633 { 6634 "type": "contains", 6635 "value": "LOW", 6636 "score": 0, 6637 "flag": "FILTERED", 6638 "comment": [ 6639 "Assumed to be mostly harmless or unlikely to change protein behavior" 6640 ], 6641 }, 6642 { 6643 "type": "contains", 6644 "value": "MODIFIER", 6645 "score": 0, 6646 "flag": "FILTERED", 6647 "comment": [ 6648 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6649 ], 6650 }, 6651 ], 6652 } 6653 }, 6654 } 6655 6656 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The
`get_config_default` function returns a dictionary containing default configurations for different calculations and prioritizations. The `name` parameter is used to specify which specific configuration to retrieve from the dictionary
Returns
The function
`get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input `name` parameter provided to the function. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.
6658 def get_config_json( 6659 self, name: str, config_dict: dict = {}, config_file: str = None 6660 ) -> dict: 6661 """ 6662 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6663 default values, a dictionary, and a file. 6664 6665 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6666 the name of the configuration. It is used to identify and retrieve the configuration settings 6667 for a specific component or module 6668 :type name: str 6669 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6670 dictionary that allows you to provide additional configuration settings or overrides. When you 6671 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6672 the key is the configuration setting you want to override or 6673 :type config_dict: dict 6674 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6675 specify the path to a configuration file that contains additional settings. If provided, the 6676 function will read the contents of this file and update the configuration dictionary with the 6677 values found in the file, overriding any existing values with the 6678 :type config_file: str 6679 :return: The function `get_config_json` returns a dictionary containing the configuration 6680 settings. 
6681 """ 6682 6683 # Create with default prioritizations 6684 config_default = self.get_config_default(name=name) 6685 configuration = config_default 6686 # log.debug(f"configuration={configuration}") 6687 6688 # Replace prioritizations from dict 6689 for config in config_dict: 6690 configuration[config] = config_dict[config] 6691 6692 # Replace prioritizations from file 6693 config_file = full_path(config_file) 6694 if config_file: 6695 if os.path.exists(config_file): 6696 with open(config_file) as config_file_content: 6697 config_file_dict = json.load(config_file_content) 6698 for config in config_file_dict: 6699 configuration[config] = config_file_dict[config] 6700 else: 6701 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6702 log.error(msg_error) 6703 raise ValueError(msg_error) 6704 6705 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
`name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module - config_dict: The
`config_dict` parameter in the `get_config_json` function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or - config_file: The
`config_file` parameter in the `get_config_json` function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns
The function
`get_config_json` returns a dictionary containing the merged configuration settings.
6707 def prioritization( 6708 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6709 ) -> bool: 6710 """ 6711 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6712 prioritizes variants based on configured profiles and criteria. 6713 6714 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6715 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6716 a table name is provided, the method will prioritize the variants in that specific table 6717 :type table: str 6718 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6719 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6720 provided, the code will use a default prefix value of "PZ" 6721 :type pz_prefix: str 6722 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6723 additional parameters specific to the prioritization process. These parameters can include 6724 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6725 configurations needed for the prioritization of variants in a V 6726 :type pz_param: dict 6727 :return: A boolean value (True) is being returned from the `prioritization` function. 
6728 """ 6729 6730 # Config 6731 config = self.get_config() 6732 6733 # Param 6734 param = self.get_param() 6735 6736 # Prioritization param 6737 if pz_param is not None: 6738 prioritization_param = pz_param 6739 else: 6740 prioritization_param = param.get("prioritization", {}) 6741 6742 # Configuration profiles 6743 prioritization_config_file = prioritization_param.get( 6744 "prioritization_config", None 6745 ) 6746 prioritization_config_file = full_path(prioritization_config_file) 6747 prioritizations_config = self.get_config_json( 6748 name="prioritizations", config_file=prioritization_config_file 6749 ) 6750 6751 # Prioritization prefix 6752 pz_prefix_default = "PZ" 6753 if pz_prefix is None: 6754 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6755 6756 # Prioritization options 6757 profiles = prioritization_param.get("profiles", []) 6758 if isinstance(profiles, str): 6759 profiles = profiles.split(",") 6760 pzfields = prioritization_param.get( 6761 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6762 ) 6763 if isinstance(pzfields, str): 6764 pzfields = pzfields.split(",") 6765 default_profile = prioritization_param.get("default_profile", None) 6766 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6767 prioritization_score_mode = prioritization_param.get( 6768 "prioritization_score_mode", "HOWARD" 6769 ) 6770 6771 # Quick Prioritizations 6772 prioritizations = param.get("prioritizations", None) 6773 if prioritizations: 6774 log.info("Quick Prioritization:") 6775 for profile in prioritizations.split(","): 6776 if profile not in profiles: 6777 profiles.append(profile) 6778 log.info(f" {profile}") 6779 6780 # If profile "ALL" provided, all profiles in the config profiles 6781 if "ALL" in profiles: 6782 profiles = list(prioritizations_config.keys()) 6783 6784 for profile in profiles: 6785 if prioritizations_config.get(profile, None): 6786 log.debug(f"Profile '{profile}' configured") 6787 else: 6788 msg_error = f"Profile 
'{profile}' NOT configured" 6789 log.error(msg_error) 6790 raise ValueError(msg_error) 6791 6792 if profiles: 6793 log.info(f"Prioritization... ") 6794 else: 6795 log.debug(f"No profile defined") 6796 return False 6797 6798 if not default_profile and len(profiles): 6799 default_profile = profiles[0] 6800 6801 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6802 log.debug("Profiles to check: " + str(list(profiles))) 6803 6804 # Variables 6805 if table is not None: 6806 table_variants = table 6807 else: 6808 table_variants = self.get_table_variants(clause="update") 6809 log.debug(f"Table to prioritize: {table_variants}") 6810 6811 # Added columns 6812 added_columns = [] 6813 6814 # Create list of PZfields 6815 # List of PZFields 6816 list_of_pzfields_original = pzfields + [ 6817 pzfield + pzfields_sep + profile 6818 for pzfield in pzfields 6819 for profile in profiles 6820 ] 6821 list_of_pzfields = [] 6822 log.debug(f"{list_of_pzfields_original}") 6823 6824 # Remove existing PZfields to use if exists 6825 for pzfield in list_of_pzfields_original: 6826 if self.get_header().infos.get(pzfield, None) is None: 6827 list_of_pzfields.append(pzfield) 6828 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6829 else: 6830 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6831 6832 if list_of_pzfields: 6833 6834 # Explode Infos prefix 6835 explode_infos_prefix = self.get_explode_infos_prefix() 6836 6837 # PZfields tags description 6838 PZfields_INFOS = { 6839 f"{pz_prefix}Tags": { 6840 "ID": f"{pz_prefix}Tags", 6841 "Number": ".", 6842 "Type": "String", 6843 "Description": "Variant tags based on annotation criteria", 6844 }, 6845 f"{pz_prefix}Score": { 6846 "ID": f"{pz_prefix}Score", 6847 "Number": 1, 6848 "Type": "Integer", 6849 "Description": "Variant score based on annotation criteria", 6850 }, 6851 f"{pz_prefix}Flag": { 6852 "ID": f"{pz_prefix}Flag", 6853 "Number": 1, 6854 "Type": "String", 6855 
"Description": "Variant flag based on annotation criteria", 6856 }, 6857 f"{pz_prefix}Comment": { 6858 "ID": f"{pz_prefix}Comment", 6859 "Number": ".", 6860 "Type": "String", 6861 "Description": "Variant comment based on annotation criteria", 6862 }, 6863 f"{pz_prefix}Infos": { 6864 "ID": f"{pz_prefix}Infos", 6865 "Number": ".", 6866 "Type": "String", 6867 "Description": "Variant infos based on annotation criteria", 6868 }, 6869 f"{pz_prefix}Class": { 6870 "ID": f"{pz_prefix}Class", 6871 "Number": ".", 6872 "Type": "String", 6873 "Description": "Variant class based on annotation criteria", 6874 }, 6875 } 6876 6877 # Create INFO fields if not exist 6878 for field in PZfields_INFOS: 6879 field_ID = PZfields_INFOS[field]["ID"] 6880 field_description = PZfields_INFOS[field]["Description"] 6881 if field_ID not in self.get_header().infos and field_ID in pzfields: 6882 field_description = ( 6883 PZfields_INFOS[field]["Description"] 6884 + f", profile {default_profile}" 6885 ) 6886 self.get_header().infos[field_ID] = vcf.parser._Info( 6887 field_ID, 6888 PZfields_INFOS[field]["Number"], 6889 PZfields_INFOS[field]["Type"], 6890 field_description, 6891 "unknown", 6892 "unknown", 6893 code_type_map[PZfields_INFOS[field]["Type"]], 6894 ) 6895 6896 # Create INFO fields if not exist for each profile 6897 for profile in prioritizations_config: 6898 if profile in profiles or profiles == []: 6899 for field in PZfields_INFOS: 6900 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6901 field_description = ( 6902 PZfields_INFOS[field]["Description"] 6903 + f", profile {profile}" 6904 ) 6905 if ( 6906 field_ID not in self.get_header().infos 6907 and field in pzfields 6908 ): 6909 self.get_header().infos[field_ID] = vcf.parser._Info( 6910 field_ID, 6911 PZfields_INFOS[field]["Number"], 6912 PZfields_INFOS[field]["Type"], 6913 field_description, 6914 "unknown", 6915 "unknown", 6916 code_type_map[PZfields_INFOS[field]["Type"]], 6917 ) 6918 6919 # Header 6920 for pzfield in 
list_of_pzfields: 6921 if re.match(f"{pz_prefix}Score.*", pzfield): 6922 added_column = self.add_column( 6923 table_name=table_variants, 6924 column_name=pzfield, 6925 column_type="INTEGER", 6926 default_value="0", 6927 ) 6928 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6929 added_column = self.add_column( 6930 table_name=table_variants, 6931 column_name=pzfield, 6932 column_type="BOOLEAN", 6933 default_value="1", 6934 ) 6935 elif re.match(f"{pz_prefix}Class.*", pzfield): 6936 added_column = self.add_column( 6937 table_name=table_variants, 6938 column_name=pzfield, 6939 column_type="VARCHAR[]", 6940 default_value="null", 6941 ) 6942 else: 6943 added_column = self.add_column( 6944 table_name=table_variants, 6945 column_name=pzfield, 6946 column_type="STRING", 6947 default_value="''", 6948 ) 6949 added_columns.append(added_column) 6950 6951 # Profiles 6952 if profiles: 6953 6954 # foreach profile in configuration file 6955 for profile in prioritizations_config: 6956 6957 # If profile is asked in param, or ALL are asked (empty profile []) 6958 if profile in profiles or profiles == []: 6959 log.info(f"Profile '{profile}'") 6960 6961 sql_set_info_option = "" 6962 6963 sql_set_info = [] 6964 6965 # PZ fields set 6966 6967 # PZScore 6968 if ( 6969 f"{pz_prefix}Score{pzfields_sep}{profile}" 6970 in list_of_pzfields 6971 ): 6972 sql_set_info.append( 6973 f""" 6974 concat( 6975 '{pz_prefix}Score{pzfields_sep}{profile}=', 6976 {pz_prefix}Score{pzfields_sep}{profile} 6977 ) 6978 """ 6979 ) 6980 if ( 6981 profile == default_profile 6982 and f"{pz_prefix}Score" in list_of_pzfields 6983 ): 6984 sql_set_info.append( 6985 f""" 6986 concat( 6987 '{pz_prefix}Score=', 6988 {pz_prefix}Score{pzfields_sep}{profile} 6989 ) 6990 """ 6991 ) 6992 6993 # PZFlag 6994 if ( 6995 f"{pz_prefix}Flag{pzfields_sep}{profile}" 6996 in list_of_pzfields 6997 ): 6998 sql_set_info.append( 6999 f""" 7000 concat( 7001 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7002 CASE 7003 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7004 THEN 'PASS' 7005 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7006 THEN 'FILTERED' 7007 END 7008 ) 7009 """ 7010 ) 7011 if ( 7012 profile == default_profile 7013 and f"{pz_prefix}Flag" in list_of_pzfields 7014 ): 7015 sql_set_info.append( 7016 f""" 7017 concat( 7018 '{pz_prefix}Flag=', 7019 CASE 7020 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7021 THEN 'PASS' 7022 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7023 THEN 'FILTERED' 7024 END 7025 ) 7026 """ 7027 ) 7028 7029 # PZClass 7030 if ( 7031 f"{pz_prefix}Class{pzfields_sep}{profile}" 7032 in list_of_pzfields 7033 ): 7034 sql_set_info.append( 7035 f""" 7036 concat( 7037 '{pz_prefix}Class{pzfields_sep}{profile}=', 7038 CASE 7039 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7040 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7041 ELSE '.' 7042 END 7043 ) 7044 7045 """ 7046 ) 7047 if ( 7048 profile == default_profile 7049 and f"{pz_prefix}Class" in list_of_pzfields 7050 ): 7051 sql_set_info.append( 7052 f""" 7053 concat( 7054 '{pz_prefix}Class=', 7055 CASE 7056 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7057 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7058 ELSE '.' 
7059 END 7060 ) 7061 """ 7062 ) 7063 7064 # PZComment 7065 if ( 7066 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7067 in list_of_pzfields 7068 ): 7069 sql_set_info.append( 7070 f""" 7071 CASE 7072 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7073 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7074 ELSE '' 7075 END 7076 """ 7077 ) 7078 if ( 7079 profile == default_profile 7080 and f"{pz_prefix}Comment" in list_of_pzfields 7081 ): 7082 sql_set_info.append( 7083 f""" 7084 CASE 7085 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7086 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7087 ELSE '' 7088 END 7089 """ 7090 ) 7091 7092 # PZInfos 7093 if ( 7094 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7095 in list_of_pzfields 7096 ): 7097 sql_set_info.append( 7098 f""" 7099 CASE 7100 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7101 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7102 ELSE '' 7103 END 7104 """ 7105 ) 7106 if ( 7107 profile == default_profile 7108 and f"{pz_prefix}Infos" in list_of_pzfields 7109 ): 7110 sql_set_info.append( 7111 f""" 7112 CASE 7113 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7114 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7115 ELSE '' 7116 END 7117 """ 7118 ) 7119 7120 # Merge PZfields 7121 sql_set_info_option = "" 7122 sql_set_sep = "" 7123 for sql_set in sql_set_info: 7124 if sql_set_sep: 7125 sql_set_info_option += f""" 7126 , concat('{sql_set_sep}', {sql_set}) 7127 """ 7128 else: 7129 sql_set_info_option += f""" 7130 , {sql_set} 7131 """ 7132 sql_set_sep = ";" 7133 7134 sql_queries = [] 7135 for annotation in prioritizations_config[profile]: 7136 7137 # skip special sections 7138 if annotation.startswith("_"): 7139 continue 7140 7141 # For each criterions 7142 for criterion in prioritizations_config[profile][ 7143 annotation 
7144 ]: 7145 7146 # Criterion mode 7147 criterion_mode = None 7148 if np.any( 7149 np.isin(list(criterion.keys()), ["type", "value"]) 7150 ): 7151 criterion_mode = "operation" 7152 elif np.any( 7153 np.isin(list(criterion.keys()), ["sql", "fields"]) 7154 ): 7155 criterion_mode = "sql" 7156 log.debug(f"Criterion Mode: {criterion_mode}") 7157 7158 # Criterion parameters 7159 criterion_type = criterion.get("type", None) 7160 criterion_value = criterion.get("value", None) 7161 criterion_sql = criterion.get("sql", None) 7162 criterion_fields = criterion.get("fields", None) 7163 criterion_score = criterion.get("score", 0) 7164 criterion_flag = criterion.get("flag", "PASS") 7165 criterion_class = criterion.get("class", None) 7166 criterion_flag_bool = criterion_flag == "PASS" 7167 criterion_comment = ( 7168 ", ".join(criterion.get("comment", [])) 7169 .replace("'", "''") 7170 .replace(";", ",") 7171 .replace("\t", " ") 7172 ) 7173 criterion_infos = ( 7174 str(criterion) 7175 .replace("'", "''") 7176 .replace(";", ",") 7177 .replace("\t", " ") 7178 ) 7179 7180 # SQL 7181 if criterion_sql is not None and isinstance( 7182 criterion_sql, list 7183 ): 7184 criterion_sql = " ".join(criterion_sql) 7185 7186 # Fields and explode 7187 if criterion_fields is None: 7188 criterion_fields = [annotation] 7189 if not isinstance(criterion_fields, list): 7190 criterion_fields = str(criterion_fields).split(",") 7191 7192 # Class 7193 if criterion_class is not None and not isinstance( 7194 criterion_class, list 7195 ): 7196 criterion_class = str(criterion_class).split(",") 7197 7198 for annotation_field in criterion_fields: 7199 7200 # Explode specific annotation 7201 log.debug( 7202 f"Explode annotation '{annotation_field}'" 7203 ) 7204 added_columns += self.explode_infos( 7205 prefix=explode_infos_prefix, 7206 fields=[annotation_field], 7207 table=table_variants, 7208 ) 7209 extra_infos = self.get_extra_infos( 7210 table=table_variants 7211 ) 7212 7213 # Check if annotation field is 
present 7214 if ( 7215 f"{explode_infos_prefix}{annotation_field}" 7216 not in extra_infos 7217 ): 7218 msq_err = f"Annotation '{annotation_field}' not in data" 7219 log.error(msq_err) 7220 raise ValueError(msq_err) 7221 else: 7222 log.debug( 7223 f"Annotation '{annotation_field}' in data" 7224 ) 7225 7226 sql_set = [] 7227 sql_set_info = [] 7228 7229 # PZ fields set 7230 7231 # PZScore 7232 if ( 7233 f"{pz_prefix}Score{pzfields_sep}{profile}" 7234 in list_of_pzfields 7235 ): 7236 # if prioritization_score_mode == "HOWARD": 7237 # sql_set.append( 7238 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7239 # ) 7240 # VaRank prioritization score mode 7241 if prioritization_score_mode == "VaRank": 7242 sql_set.append( 7243 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7244 ) 7245 # default HOWARD prioritization score mode 7246 else: 7247 sql_set.append( 7248 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7249 ) 7250 7251 # PZFlag 7252 if ( 7253 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7254 in list_of_pzfields 7255 ): 7256 sql_set.append( 7257 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7258 ) 7259 7260 # PZClass 7261 if ( 7262 f"{pz_prefix}Class{pzfields_sep}{profile}" 7263 in list_of_pzfields 7264 and criterion_class is not None 7265 ): 7266 sql_set.append( 7267 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7268 ) 7269 7270 # PZComment 7271 if ( 7272 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7273 in list_of_pzfields 7274 ): 7275 sql_set.append( 7276 f""" 7277 {pz_prefix}Comment{pzfields_sep}{profile} = 7278 concat( 7279 {pz_prefix}Comment{pzfields_sep}{profile}, 7280 CASE 7281 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7282 THEN ', ' 7283 ELSE '' 7284 END, 7285 '{criterion_comment}' 7286 ) 7287 """ 7288 ) 7289 7290 # PZInfos 7291 if ( 7292 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7293 in list_of_pzfields 7294 ): 7295 sql_set.append( 7296 f""" 7297 {pz_prefix}Infos{pzfields_sep}{profile} = 7298 concat( 7299 {pz_prefix}Infos{pzfields_sep}{profile}, 7300 '{criterion_infos}' 7301 ) 7302 """ 7303 ) 7304 sql_set_option = ",".join(sql_set) 7305 7306 # Criterion and comparison 7307 if sql_set_option: 7308 7309 if criterion_mode in ["operation"]: 7310 7311 try: 7312 float(criterion_value) 7313 sql_update = f""" 7314 UPDATE {table_variants} 7315 SET {sql_set_option} 7316 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7317 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7318 """ 7319 except: 7320 contains_option = "" 7321 if criterion_type == "contains": 7322 contains_option = ".*" 7323 sql_update = f""" 7324 UPDATE {table_variants} 7325 SET {sql_set_option} 7326 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7327 """ 7328 sql_queries.append(sql_update) 7329 7330 elif criterion_mode in ["sql"]: 7331 7332 sql_update = f""" 7333 UPDATE {table_variants} 7334 SET {sql_set_option} 7335 WHERE {criterion_sql} 7336 """ 7337 sql_queries.append(sql_update) 7338 7339 else: 7340 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7341 log.error(msg_err) 7342 raise ValueError(msg_err) 7343 7344 else: 7345 log.warning( 7346 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7347 ) 7348 7349 # PZTags 7350 if ( 7351 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7352 in list_of_pzfields 7353 ): 7354 7355 # Create PZFalgs value 7356 pztags_value = "" 7357 pztags_sep_default = "," 7358 pztags_sep = "" 7359 for pzfield in pzfields: 7360 if pzfield not in [f"{pz_prefix}Tags"]: 7361 if ( 7362 
f"{pzfield}{pzfields_sep}{profile}" 7363 in list_of_pzfields 7364 ): 7365 if pzfield in [f"{pz_prefix}Flag"]: 7366 pztags_value += f"""{pztags_sep}{pzfield}#', 7367 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7368 THEN 'PASS' 7369 ELSE 'FILTERED' 7370 END, '""" 7371 elif pzfield in [f"{pz_prefix}Class"]: 7372 pztags_value += f"""{pztags_sep}{pzfield}#', 7373 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7374 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7375 ELSE '.' 7376 END, '""" 7377 else: 7378 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7379 pztags_sep = pztags_sep_default 7380 7381 # Add Query update for PZFlags 7382 sql_update_pztags = f""" 7383 UPDATE {table_variants} 7384 SET INFO = concat( 7385 INFO, 7386 CASE WHEN INFO NOT in ('','.') 7387 THEN ';' 7388 ELSE '' 7389 END, 7390 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7391 ) 7392 """ 7393 sql_queries.append(sql_update_pztags) 7394 7395 # Add Query update for PZFlags for default 7396 if profile == default_profile: 7397 sql_update_pztags_default = f""" 7398 UPDATE {table_variants} 7399 SET INFO = concat( 7400 INFO, 7401 ';', 7402 '{pz_prefix}Tags={pztags_value}' 7403 ) 7404 """ 7405 sql_queries.append(sql_update_pztags_default) 7406 7407 log.info(f"""Profile '{profile}' - Prioritization... """) 7408 7409 if sql_queries: 7410 7411 for sql_query in sql_queries: 7412 log.debug( 7413 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7414 ) 7415 self.conn.execute(sql_query) 7416 7417 log.info(f"""Profile '{profile}' - Update... 
""") 7418 sql_query_update = f""" 7419 UPDATE {table_variants} 7420 SET INFO = 7421 concat( 7422 CASE 7423 WHEN INFO NOT IN ('','.') 7424 THEN concat(INFO, ';') 7425 ELSE '' 7426 END 7427 {sql_set_info_option} 7428 ) 7429 """ 7430 self.conn.execute(sql_query_update) 7431 7432 else: 7433 7434 log.warning(f"No profiles in parameters") 7435 7436 # Remove added columns 7437 for added_column in added_columns: 7438 self.drop_column(column=added_column) 7439 7440 # Explode INFOS fields into table fields 7441 if self.get_explode_infos(): 7442 self.explode_infos( 7443 prefix=self.get_explode_infos_prefix(), 7444 fields=self.get_explode_infos_fields(), 7445 force=True, 7446 ) 7447 7448 return True
The `prioritization` function processes VCF variants, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.

Parameters

- table: The `table` parameter specifies the name of the table on which the
  prioritization operation will be performed. If a table name is provided, the
  method prioritizes the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter specifies a prefix added to certain INFO
  fields of the VCF file during the prioritization process. If this parameter is
  not provided, the default prefix "PZ" is used.
- pz_param: The `pz_param` parameter passes additional parameters specific to the
  prioritization process. These parameters can include settings related to
  prioritization profiles, fields, scoring modes, flags, comments, and other
  configuration needed to prioritize the variants of a VCF.

Returns

A boolean value (True) is returned from the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        Perform HGVS annotation on the variants of the variants table.

        For every SNV/InDel variant, the overlapping RefSeq transcripts are
        looked up, an HGVS name is formatted per transcript, and the resulting
        comma-separated list is appended to the INFO column as an ``hgvs=`` tag.
        A matching ``hgvs`` INFO field is added to the VCF header.

        Returns early (doing nothing) when no "hgvs" section is present in the
        parameters. Temporary working columns added to the variants table are
        dropped before returning.

        :param threads: Optional number of threads (Dask partitions) used for
            parallel processing. Defaults to the value returned by
            `get_threads()`.
        :type threads: int
        """

        # Function applied to each partition of the Dask DataFrame:
        # maps every variant row to its HGVS annotation string.
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of the given
            pandas DataFrame partition (axis=1 → row-wise).

            :param partition: pandas DataFrame partition with CHROM/POS/REF/ALT
            :return: the per-row results of `annotation_hgvs_partition`
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the HGVS annotation string for one variant row.

            :param row: dict-like with keys "CHROM", "POS", "REF", "ALT"
            :return: comma-joined HGVS names for all transcripts overlapping
                the variant position (empty string when none)
            """

            # NOTE(review): `chr` shadows the builtin; kept as-is here.
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find the transcripts overlapping this position
            # (refseq_df is the polars DataFrame registered below via
            # SQLContext(register_globals=True))
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model (parsed earlier by read_transcripts)
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number, only when requested
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession, only when a protein-level name is needed
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name for the same
                # transcript (only when not already covered by use_protein /
                # full_format)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection; register_globals exposes the refseq_df /
        # refseqlink_df / df_variants globals to SQL queries.
        # NOTE(review): `pl` (polars) and `dd` (dask.dataframe) are expected
        # to be imported at module level — not visible in this chunk; confirm.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome folder without default (used to detect an explicit setting)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse the comma-separated "hgvs_options" shortcut into
        # the "hgvs" param section (key=value, bare key means True)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # Coerce textual booleans
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param values override config values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (working columns removed at the end)
        added_columns = []

        # Add hgvs working column in variants table; random suffix avoids
        # clashing with an existing column name
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load the transcripts overlapping the variants into a polars
        # DataFrame (queried later as refseq_df through the SQLContext)
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Load the transcript→protein accession mapping
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried as refseqlink_df through SQLContext)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            # Export the overlapping transcripts to TSV so read_transcripts
            # can parse them into transcript objects
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        # NOTE(review): this re-creates the SQLContext built above —
        # presumably to re-register the now-defined refseq_df/refseqlink_df
        # globals; confirm before simplifying.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from the Pandas dataframe, with one
        # partition per thread
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs working column from the parquet, matching on the
            # full variant key (CHROM/POS/REF/ALT), non-empty values only
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Append 'hgvs=<value>' to the INFO column (';'-separated when
            # INFO already has content)
            sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add 'hgvs' INFO field to the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added working columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The `annotation_hgvs` function performs HGVS annotation on a set of variants using
genomic coordinates and alleles.

Parameters

- threads: The `threads` parameter is an optional integer that specifies the
  number of threads to use for parallel processing. If no value is provided, it
  defaults to the number of threads obtained from the `get_threads()` method.
7841 def get_operations_help( 7842 self, operations_config_dict: dict = {}, operations_config_file: str = None 7843 ) -> list: 7844 7845 # Init 7846 operations_help = [] 7847 7848 # operations 7849 operations = self.get_config_json( 7850 name="calculations", 7851 config_dict=operations_config_dict, 7852 config_file=operations_config_file, 7853 ) 7854 for op in operations: 7855 op_name = operations[op].get("name", op).upper() 7856 op_description = operations[op].get("description", op_name) 7857 op_available = operations[op].get("available", False) 7858 if op_available: 7859 operations_help.append(f" {op_name}: {op_description}") 7860 7861 # Sort operations 7862 operations_help.sort() 7863 7864 # insert header 7865 operations_help.insert(0, "Available calculation operations:") 7866 7867 # Return 7868 return operations_help
7870 def calculation( 7871 self, 7872 operations: dict = {}, 7873 operations_config_dict: dict = {}, 7874 operations_config_file: str = None, 7875 ) -> None: 7876 """ 7877 It takes a list of operations, and for each operation, it checks if it's a python or sql 7878 operation, and then calls the appropriate function 7879 7880 param json example: 7881 "calculation": { 7882 "NOMEN": { 7883 "options": { 7884 "hgvs_field": "hgvs" 7885 }, 7886 "middle" : null 7887 } 7888 """ 7889 7890 # Param 7891 param = self.get_param() 7892 7893 # operations config 7894 operations_config = self.get_config_json( 7895 name="calculations", 7896 config_dict=operations_config_dict, 7897 config_file=operations_config_file, 7898 ) 7899 7900 # Upper keys 7901 operations_config = {k.upper(): v for k, v in operations_config.items()} 7902 7903 # Calculations 7904 7905 # Operations from param 7906 operations = param.get("calculation", {}).get("calculations", operations) 7907 7908 # Quick calculation - add 7909 if param.get("calculations", None): 7910 calculations_list = [ 7911 value for value in param.get("calculations", "").split(",") 7912 ] 7913 log.info(f"Quick Calculations:") 7914 for calculation_key in calculations_list: 7915 log.info(f" {calculation_key}") 7916 for calculation_operation in calculations_list: 7917 if calculation_operation.upper() not in operations: 7918 operations[calculation_operation.upper()] = {} 7919 add_value_into_dict( 7920 dict_tree=param, 7921 sections=[ 7922 "calculation", 7923 "calculations", 7924 calculation_operation.upper(), 7925 ], 7926 value={}, 7927 ) 7928 7929 # Operations for calculation 7930 if not operations: 7931 operations = param.get("calculation", {}).get("calculations", {}) 7932 7933 if operations: 7934 log.info(f"Calculations...") 7935 7936 # For each operations 7937 for operation_name in operations: 7938 operation_name = operation_name.upper() 7939 if operation_name not in [""]: 7940 if operation_name in operations_config: 7941 
log.info(f"Calculation '{operation_name}'") 7942 operation = operations_config[operation_name] 7943 operation_type = operation.get("type", "sql") 7944 if operation_type == "python": 7945 self.calculation_process_function( 7946 operation=operation, operation_name=operation_name 7947 ) 7948 elif operation_type == "sql": 7949 self.calculation_process_sql( 7950 operation=operation, operation_name=operation_name 7951 ) 7952 else: 7953 log.error( 7954 f"Operations config: Type '{operation_type}' NOT available" 7955 ) 7956 raise ValueError( 7957 f"Operations config: Type '{operation_type}' NOT available" 7958 ) 7959 else: 7960 log.error( 7961 f"Operations config: Calculation '{operation_name}' NOT available" 7962 ) 7963 raise ValueError( 7964 f"Operations config: Calculation '{operation_name}' NOT available" 7965 ) 7966 7967 # Explode INFOS fields into table fields 7968 if self.get_explode_infos(): 7969 self.explode_infos( 7970 prefix=self.get_explode_infos_prefix(), 7971 fields=self.get_explode_infos_fields(), 7972 force=True, 7973 )
It takes a list of operations and, for each operation, checks whether it is a
Python or SQL operation, then calls the appropriate function.

param JSON example:

    "calculation": {
        "NOMEN": {
            "options": {
                "hgvs_field": "hgvs"
            },
            "middle": null
        }
    }
7975 def calculation_process_sql( 7976 self, operation: dict, operation_name: str = "unknown" 7977 ) -> None: 7978 """ 7979 The `calculation_process_sql` function takes in a mathematical operation as a string and 7980 performs the operation, updating the specified table with the result. 7981 7982 :param operation: The `operation` parameter is a dictionary that contains information about the 7983 mathematical operation to be performed. It includes the following keys: 7984 :type operation: dict 7985 :param operation_name: The `operation_name` parameter is a string that represents the name of 7986 the mathematical operation being performed. It is used for logging and error handling purposes, 7987 defaults to unknown 7988 :type operation_name: str (optional) 7989 """ 7990 7991 # table variants 7992 table_variants = self.get_table_variants(clause="alter") 7993 7994 # Operation infos 7995 operation_name = operation.get("name", "unknown") 7996 log.debug(f"process sql {operation_name}") 7997 output_column_name = operation.get("output_column_name", operation_name) 7998 output_column_type = operation.get("output_column_type", "String") 7999 prefix = operation.get("explode_infos_prefix", "") 8000 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8001 output_column_description = operation.get( 8002 "output_column_description", f"{operation_name} operation" 8003 ) 8004 operation_query = operation.get("operation_query", None) 8005 if isinstance(operation_query, list): 8006 operation_query = " ".join(operation_query) 8007 operation_info_fields = operation.get("info_fields", []) 8008 operation_info_fields_check = operation.get("info_fields_check", False) 8009 operation_info = operation.get("operation_info", True) 8010 8011 if operation_query: 8012 8013 # Info fields check 8014 operation_info_fields_check_result = True 8015 if operation_info_fields_check: 8016 header_infos = self.get_header().infos 8017 for info_field in operation_info_fields: 8018 
operation_info_fields_check_result = ( 8019 operation_info_fields_check_result 8020 and info_field in header_infos 8021 ) 8022 8023 # If info fields available 8024 if operation_info_fields_check_result: 8025 8026 # Added_columns 8027 added_columns = [] 8028 8029 # Create VCF header field 8030 vcf_reader = self.get_header() 8031 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8032 output_column_name, 8033 ".", 8034 output_column_type, 8035 output_column_description, 8036 "howard calculation", 8037 "0", 8038 self.code_type_map.get(output_column_type), 8039 ) 8040 8041 # Explode infos if needed 8042 log.debug(f"calculation_process_sql prefix {prefix}") 8043 added_columns += self.explode_infos( 8044 prefix=prefix, 8045 fields=[output_column_name] + operation_info_fields, 8046 force=True, 8047 ) 8048 8049 # Create column 8050 added_column = self.add_column( 8051 table_name=table_variants, 8052 column_name=prefix + output_column_name, 8053 column_type=output_column_type_sql, 8054 default_value="null", 8055 ) 8056 added_columns.append(added_column) 8057 8058 # Operation calculation 8059 try: 8060 8061 # Query to update calculation column 8062 sql_update = f""" 8063 UPDATE {table_variants} 8064 SET "{prefix}{output_column_name}" = ({operation_query}) 8065 """ 8066 self.conn.execute(sql_update) 8067 8068 # Add to INFO 8069 if operation_info: 8070 sql_update_info = f""" 8071 UPDATE {table_variants} 8072 SET "INFO" = 8073 concat( 8074 CASE 8075 WHEN "INFO" IS NOT NULL 8076 THEN concat("INFO", ';') 8077 ELSE '' 8078 END, 8079 '{output_column_name}=', 8080 "{prefix}{output_column_name}" 8081 ) 8082 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8083 """ 8084 self.conn.execute(sql_update_info) 8085 8086 except: 8087 log.error( 8088 f"Operations config: Calculation '{operation_name}' query failed" 8089 ) 8090 raise ValueError( 8091 f"Operations config: Calculation '{operation_name}' query failed" 8092 ) 8093 8094 # Remove 
added columns 8095 for added_column in added_columns: 8096 log.debug(f"added_column: {added_column}") 8097 self.drop_column(column=added_column) 8098 8099 else: 8100 log.error( 8101 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8102 ) 8103 raise ValueError( 8104 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8105 ) 8106 8107 else: 8108 log.error( 8109 f"Operations config: Calculation '{operation_name}' query NOT defined" 8110 ) 8111 raise ValueError( 8112 f"Operations config: Calculation '{operation_name}' query NOT defined" 8113 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8115 def calculation_process_function( 8116 self, operation: dict, operation_name: str = "unknown" 8117 ) -> None: 8118 """ 8119 The `calculation_process_function` takes in an operation dictionary and performs the specified 8120 function with the given parameters. 8121 8122 :param operation: The `operation` parameter is a dictionary that contains information about the 8123 operation to be performed. It has the following keys: 8124 :type operation: dict 8125 :param operation_name: The `operation_name` parameter is a string that represents the name of 8126 the operation being performed. It is used for logging purposes, defaults to unknown 8127 :type operation_name: str (optional) 8128 """ 8129 8130 operation_name = operation["name"] 8131 log.debug(f"process sql {operation_name}") 8132 function_name = operation["function_name"] 8133 function_params = operation["function_params"] 8134 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.

        The variant ID column itself is presumably created/ensured by
        `get_variant_id_column` — TODO confirm; it is dropped again at the end.
        """

        # variant_id annotation field (column name and INFO tag share the same name)
        variant_id_tag = self.get_variant_id_column()
        # Temporary columns to drop once INFO has been updated
        added_columns = [variant_id_tag]

        # VCF INFO tag descriptions
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append '<tag>=<value>' to INFO; the ';' separator is only added when
        # INFO is non-empty (NULL, '' and '.' are treated as empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: Name of the INFO tag/column that will store the HGVS
            nomenclatures extracted from the SnpEff annotation field, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Field in the VCF file that contains SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed.
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any truthy explode-infos prefix is replaced by "INFO/" —
        # exploded columns appear to use the "INFO/" prefix convention; confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names of the exploded INFO fields)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff lists the sub-field names inside single
            # quotes, separated by " | " (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column.
            # NOTE(review): duckdb resolves `dataframe_snpeff_hgvs` in the FROM
            # clause by the local Python variable name (replacement scan) —
            # do not rename the variable. The hardcoded table name "variants"
            # vs {table_variants} in the WHERE clause presumably always match;
            # confirm.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations; this field is used to extract HGVS nomenclatures and add them as a new column; defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: Boolean flag that determines whether the output should be
            uniquified (duplicate entries removed), defaults to True
        :type uniquify: bool (optional)
        :param output_format: Format of the generated annotations: "fields" produces
            one annotation per snpEff sub-field; "JSON" produces a single JSON
            annotation, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: Prefix added to the generated annotations so they can be
            distinguished from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: Field in the VCF file that contains SnpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed.
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any truthy explode-infos prefix is replaced by "INFO/" —
        # same convention as calculation_extract_snpeff_hgvs; confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names of the exploded INFO fields)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff lists the sub-field names inside single
            # quotes, separated by " | " (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns (exploded/JSON, depending on output_format)
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one tag for JSON output, otherwise one tag per snpEff sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # In JSON mode the value is prefixed with '<output_prefix>=' in INFO
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update INFO from the dataframe, joined on the variant id column.
            # NOTE(review): duckdb resolves `dataframe_snpeff_hgvs` in the FROM
            # clause by the local Python variable name (replacement scan) —
            # do not rename the variable. "variants" is hardcoded here while
            # {table_variants} is used in the WHERE clause; confirm they match.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified, meaning that duplicate entries are removed; defaults to True.
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated. It has a default value of "fields"; it can also be set to "JSON" to output the annotations in JSON format.
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method specifies the prefix added to the output annotations generated during the calculation, to differentiate the newly added annotations from existing ones; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function specifies the field in the VCF file that contains SnpEff annotations; this field is processed to explode the annotations and update the variant information accordingly; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        "hgvs_field" (INFO field holding HGVS strings, default "hgvs") and
        "transcripts" (optional transcript-preference file).

        :raises ValueError: if the configured transcripts file does not exist.
        """

        # NOMEN field (name of the intermediate dict column in the dataframe)
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: one INFO tag per NOMEN component, with its description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (optional transcript-preference file; first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Temporary columns to drop at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos (available exploded columns)
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe (joined back on the full variant key, not variant id)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN components per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # One concat term per NOMEN component; empty components contribute ''
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO from the dataframe, joined on #CHROM/POS/REF/ALT.
            # NOTE(review): duckdb resolves `dataframe_hgvs` in the FROM clause by
            # the local Python variable name (replacement scan) — do not rename it
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to free memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        No-op when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
            "findbypipeline" information in the VCF file. It is used to create the annotation field
            in the VCF header and to update the corresponding field in the variants table, defaults
            to findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the computed value)
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (per-row computation over genotypes)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column.
            # NOTE(review): duckdb resolves `dataframe_findbypipeline` in the FROM
            # clause by the local Python variable name (replacement scan)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_findbypipeline
            gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.

        No-op when the VCF has no FORMAT column or no samples.
        """

        # Only applicable when genotypes are present (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the computed value)
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (per-row computation over genotypes)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations" is a
            # copy-paste leftover; it is never used because the tag is always a key
            # of vcf_infos_tags
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO from the dataframe, joined on the variant id column.
            # NOTE(review): duckdb resolves `dataframe_genotypeconcordance` in the
            # FROM clause by the local Python variable name (replacement scan);
            # "variants" is hardcoded here while other methods use {table_variants}
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to free memory
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
8849 def calculation_barcode(self, tag: str = "barcode") -> None: 8850 """ 8851 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8852 updates the INFO field in the file with the calculated barcode values. 8853 8854 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8855 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 8856 the default tag name is set to "barcode", defaults to barcode 8857 :type tag: str (optional) 8858 """ 8859 8860 # if FORMAT and samples 8861 if ( 8862 "FORMAT" in self.get_header_columns_as_list() 8863 and self.get_header_sample_list() 8864 ): 8865 8866 # barcode annotation field 8867 if not tag: 8868 tag = "barcode" 8869 8870 # VCF infos tags 8871 vcf_infos_tags = { 8872 tag: "barcode calculation (VaRank)", 8873 } 8874 8875 # Prefix 8876 prefix = self.get_explode_infos_prefix() 8877 8878 # Field 8879 barcode_infos = prefix + tag 8880 8881 # Variants table 8882 table_variants = self.get_table_variants() 8883 8884 # Header 8885 vcf_reader = self.get_header() 8886 8887 # Create variant id 8888 variant_id_column = self.get_variant_id_column() 8889 added_columns = [variant_id_column] 8890 8891 # variant_id, FORMAT and samples 8892 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8893 self.get_header_sample_list() 8894 ) 8895 8896 # Create dataframe 8897 dataframe_barcode = self.get_query_to_df( 8898 f""" SELECT {samples_fields} FROM {table_variants} """ 8899 ) 8900 8901 # Create barcode column 8902 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8903 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8904 ) 8905 8906 # Add barcode to header 8907 vcf_reader.infos[tag] = vcf.parser._Info( 8908 tag, 8909 ".", 8910 "String", 8911 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8912 "howard calculation", 8913 "0", 8914 self.code_type_map.get("String"), 8915 ) 8916 8917 # 
Update 8918 sql_update = f""" 8919 UPDATE {table_variants} 8920 SET "INFO" = 8921 concat( 8922 CASE 8923 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8924 THEN '' 8925 ELSE concat("INFO", ';') 8926 END, 8927 CASE 8928 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8929 AND dataframe_barcode."{barcode_infos}" NOT NULL 8930 THEN concat( 8931 '{tag}=', 8932 dataframe_barcode."{barcode_infos}" 8933 ) 8934 ELSE '' 8935 END 8936 ) 8937 FROM dataframe_barcode 8938 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8939 """ 8940 self.conn.execute(sql_update) 8941 8942 # Remove added columns 8943 for added_column in added_columns: 8944 self.drop_column(column=added_column) 8945 8946 # Delete dataframe 8947 del dataframe_barcode 8948 gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode"; defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family genotype barcode for each variant and append it to every
        sample genotype (plus the FORMAT column) as two new FORMAT fields:
        '<tag>' (the barcode) and '<tag>S' (the comma-joined family sample names).

        The family members come from the
        'calculation.calculations.BARCODEFAMILY.family_pedigree' parameter — a
        JSON file path, a JSON string, a comma-separated sample list, or a dict;
        when absent, all header samples are used.

        Does nothing unless the table has a FORMAT column and at least one sample.

        :param tag: The `tag` parameter is the FORMAT tag added to the VCF file;
            falsy values fall back to "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not a str/dict or resolves to an
            empty sample set
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name if an empty/None tag was given
            if not tag:
                tag = "BCF"

            # Descriptions of the FORMAT tags added to the VCF header
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load pedigree into a dict {member: sample_name}
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family sample names (dict values)
                ped_samples = list(ped.values())

            else:
                # No pedigree: every header sample belongs to the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column that will hold the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Column uniquely identifying each variant (used to join back below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and the family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch family genotypes into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two new FORMAT tags in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per genotype column:
            # - family samples get the barcode value and the family sample list
            # - FORMAT gets the two new tag names
            # - non-family samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes './.', pad with one '.' per FORMAT field
                # (FORMAT stripped of alphanumerics leaves the ':' separators)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses, joining on the variant id column.
            # 'dataframe_barcode' in the SQL is resolved by DuckDB from the local
            # Python variable of the same name (replacement scan) — do not rename.
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove the helper variant id column added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release dataframe memory
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF"; defaults to "BCF".
9140 def calculation_trio(self) -> None: 9141 """ 9142 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9143 information to the INFO field of each variant. 9144 """ 9145 9146 # if FORMAT and samples 9147 if ( 9148 "FORMAT" in self.get_header_columns_as_list() 9149 and self.get_header_sample_list() 9150 ): 9151 9152 # trio annotation field 9153 trio_tag = "trio" 9154 9155 # VCF infos tags 9156 vcf_infos_tags = { 9157 "trio": "trio calculation", 9158 } 9159 9160 # Param 9161 param = self.get_param() 9162 9163 # Prefix 9164 prefix = self.get_explode_infos_prefix() 9165 9166 # Trio param 9167 trio_ped = ( 9168 param.get("calculation", {}) 9169 .get("calculations", {}) 9170 .get("TRIO", {}) 9171 .get("trio_pedigree", None) 9172 ) 9173 9174 # Load trio 9175 if trio_ped: 9176 9177 # Trio pedigree is a file 9178 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9179 log.debug("TRIO pedigree is file") 9180 with open(full_path(trio_ped)) as trio_ped: 9181 trio_ped = json.load(trio_ped) 9182 9183 # Trio pedigree is a string 9184 elif isinstance(trio_ped, str): 9185 log.debug("TRIO pedigree is str") 9186 try: 9187 trio_ped = json.loads(trio_ped) 9188 log.debug("TRIO pedigree is json str") 9189 except ValueError as e: 9190 trio_samples = trio_ped.split(",") 9191 if len(trio_samples) == 3: 9192 trio_ped = { 9193 "father": trio_samples[0], 9194 "mother": trio_samples[1], 9195 "child": trio_samples[2], 9196 } 9197 log.debug("TRIO pedigree is list str") 9198 else: 9199 msg_error = "TRIO pedigree not well formatted" 9200 log.error(msg_error) 9201 raise ValueError(msg_error) 9202 9203 # Trio pedigree is a dict 9204 elif isinstance(trio_ped, dict): 9205 log.debug("TRIO pedigree is dict") 9206 9207 # Trio pedigree is not well formatted 9208 else: 9209 msg_error = "TRIO pedigree not well formatted" 9210 log.error(msg_error) 9211 raise ValueError(msg_error) 9212 9213 # Construct trio list 9214 trio_samples = [ 9215 
trio_ped.get("father", ""), 9216 trio_ped.get("mother", ""), 9217 trio_ped.get("child", ""), 9218 ] 9219 9220 else: 9221 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9222 samples_list = self.get_header_sample_list() 9223 if len(samples_list) >= 3: 9224 trio_samples = self.get_header_sample_list()[0:3] 9225 trio_ped = { 9226 "father": trio_samples[0], 9227 "mother": trio_samples[1], 9228 "child": trio_samples[2], 9229 } 9230 else: 9231 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9232 log.error(msg_error) 9233 raise ValueError(msg_error) 9234 9235 # Check trio pedigree 9236 if not trio_ped or len(trio_ped) != 3: 9237 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9238 log.error(msg_error) 9239 raise ValueError(msg_error) 9240 9241 # Log 9242 log.info( 9243 f"Calculation 'TRIO' - Samples: " 9244 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9245 ) 9246 9247 # Field 9248 trio_infos = prefix + trio_tag 9249 9250 # Variants table 9251 table_variants = self.get_table_variants() 9252 9253 # Header 9254 vcf_reader = self.get_header() 9255 9256 # Create variant id 9257 variant_id_column = self.get_variant_id_column() 9258 added_columns = [variant_id_column] 9259 9260 # variant_id, FORMAT and samples 9261 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9262 self.get_header_sample_list() 9263 ) 9264 9265 # Create dataframe 9266 dataframe_trio = self.get_query_to_df( 9267 f""" SELECT {samples_fields} FROM {table_variants} """ 9268 ) 9269 9270 # Create trio column 9271 dataframe_trio[trio_infos] = dataframe_trio.apply( 9272 lambda row: trio(row, samples=trio_samples), axis=1 9273 ) 9274 9275 # Add trio to header 9276 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9277 trio_tag, 9278 ".", 9279 "String", 9280 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9281 "howard calculation", 9282 "0", 9283 self.code_type_map.get("String"), 9284 ) 9285 9286 # Update 9287 
sql_update = f""" 9288 UPDATE {table_variants} 9289 SET "INFO" = 9290 concat( 9291 CASE 9292 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9293 THEN '' 9294 ELSE concat("INFO", ';') 9295 END, 9296 CASE 9297 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9298 AND dataframe_trio."{trio_infos}" NOT NULL 9299 THEN concat( 9300 '{trio_tag}=', 9301 dataframe_trio."{trio_infos}" 9302 ) 9303 ELSE '' 9304 END 9305 ) 9306 FROM dataframe_trio 9307 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9308 """ 9309 self.conn.execute(sql_update) 9310 9311 # Remove added columns 9312 for added_column in added_columns: 9313 self.drop_column(column=added_column) 9314 9315 # Delete dataframe 9316 del dataframe_trio 9317 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9319 def calculation_vaf_normalization(self) -> None: 9320 """ 9321 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9322 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9323 :return: The function does not return anything. 9324 """ 9325 9326 # if FORMAT and samples 9327 if ( 9328 "FORMAT" in self.get_header_columns_as_list() 9329 and self.get_header_sample_list() 9330 ): 9331 9332 # vaf_normalization annotation field 9333 vaf_normalization_tag = "VAF" 9334 9335 # VCF infos tags 9336 vcf_infos_tags = { 9337 "VAF": "VAF Variant Frequency", 9338 } 9339 9340 # Prefix 9341 prefix = self.get_explode_infos_prefix() 9342 9343 # Variants table 9344 table_variants = self.get_table_variants() 9345 9346 # Header 9347 vcf_reader = self.get_header() 9348 9349 # Do not calculate if VAF already exists 9350 if "VAF" in vcf_reader.formats: 9351 log.debug("VAF already on genotypes") 9352 return 9353 9354 # Create variant id 9355 variant_id_column = self.get_variant_id_column() 9356 added_columns = [variant_id_column] 9357 9358 # variant_id, FORMAT and samples 9359 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9360 f""" "{sample}" """ for sample in self.get_header_sample_list() 9361 ) 9362 9363 # Create dataframe 9364 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9365 log.debug(f"query={query}") 9366 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9367 9368 vaf_normalization_set = [] 9369 9370 # for each sample vaf_normalization 9371 for sample in self.get_header_sample_list(): 9372 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9373 lambda row: vaf_normalization(row, sample=sample), axis=1 9374 ) 9375 vaf_normalization_set.append( 9376 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9377 ) 9378 9379 # Add VAF to FORMAT 9380 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9381 "FORMAT" 9382 ].apply(lambda x: str(x) + ":VAF") 9383 vaf_normalization_set.append( 9384 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9385 ) 9386 9387 # Add vaf_normalization to header 9388 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9389 id=vaf_normalization_tag, 9390 num="1", 9391 type="Float", 9392 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9393 type_code=self.code_type_map.get("Float"), 9394 ) 9395 9396 # Create fields to add in INFO 9397 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9398 9399 # Update 9400 sql_update = f""" 9401 UPDATE {table_variants} 9402 SET {sql_vaf_normalization_set} 9403 FROM dataframe_vaf_normalization 9404 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9405 9406 """ 9407 self.conn.execute(sql_update) 9408 9409 # Remove added columns 9410 for added_column in added_columns: 9411 self.drop_column(column=added_column) 9412 9413 # Delete dataframe 9414 del dataframe_vaf_normalization 9415 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate genotype statistics (number, list, min, max, mean, median,
        standard deviation) for a genotype information field across all samples,
        and append them to the INFO column as '<info>_stats_*' tags.

        Does nothing unless the table has a FORMAT column and at least one sample.

        :param info: The `info` parameter is a string naming the genotype field
            for which statistics are calculated; it prefixes every generated
            '<info>_stats_*' INFO tag, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag for the stats annotation
            vaf_stats_tag = info + "_stats"

            # Descriptions of the INFO tags added to the VCF header
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Column uniquely identifying each variant (used to join back below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute all statistics row by row (one dict per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per stat tag
            sql_vaf_stats_fields = []

            # For each stat: extract its value into its own column, declare the
            # header tag, and build its INFO concat fragment
            for stat in vcf_infos_tags:

                # Extract this stat from the stats dict
                # (the lambda is applied immediately, so closing over the loop
                # variable 'stat' is safe here)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat INFO tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator for every fragment after the first
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # INFO fragment '<sep><stat>=<value>' (empty when value is NULL)
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # Join fragments as successive concat() arguments
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append all stats to INFO, joining on the variant id column.
            # 'dataframe_vaf_stats' in the SQL is resolved by DuckDB from the
            # local Python variable of the same name (replacement scan).
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Remove the helper variant id column added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release dataframe memory
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median; defaults to "VAF".
9555 def calculation_transcripts_annotation( 9556 self, info_json: str = None, info_format: str = None 9557 ) -> None: 9558 """ 9559 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9560 field to it if transcripts are available. 9561 9562 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9563 is a string parameter that represents the information field to be used in the transcripts JSON. 9564 It is used to specify the JSON format for the transcripts information. If no value is provided 9565 when calling the method, it defaults to " 9566 :type info_json: str 9567 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9568 method is a string parameter that specifies the format of the information field to be used in 9569 the transcripts JSON. It is used to define the format of the information field 9570 :type info_format: str 9571 """ 9572 9573 # Create transcripts table 9574 transcripts_table = self.create_transcript_view() 9575 9576 # Add info field 9577 if transcripts_table: 9578 self.transcript_view_to_variants( 9579 transcripts_table=transcripts_table, 9580 transcripts_info_field_json=info_json, 9581 transcripts_info_field_format=info_format, 9582 ) 9583 else: 9584 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information; it has no default value when the method is called without it.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field.
9586 def calculation_transcripts_prioritization(self) -> None: 9587 """ 9588 The function `calculation_transcripts_prioritization` creates a transcripts table and 9589 prioritizes transcripts based on certain criteria. 9590 """ 9591 9592 # Create transcripts table 9593 transcripts_table = self.create_transcript_view() 9594 9595 # Add info field 9596 if transcripts_table: 9597 self.transcripts_prioritization(transcripts_table=transcripts_table) 9598 else: 9599 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and export, for each variant, the best-ranked
        transcript plus its prioritization score and flag into the variants
        table INFO column ('<pzprefix>Transcript', '<pzprefix>Score',
        '<pzprefix>Flag').

        :param transcripts_table: name of the table containing transcripts data;
            when None, the transcripts view is (re)created
        :type transcripts_table: str
        :param param: configuration dictionary (transcripts prioritization
            profiles, pzprefix, ...); when empty, the object's parameters are used
        :type param: dict
        :return: True when prioritization completed; False when no profile is
            defined or the prioritization step did not process
        :raises ValueError: if no transcripts table is available
        """

        log.debug("Start transcripts prioritization...")

        # Default to the object's parameters
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()
        log.debug(f"transcripts_table={transcripts_table}")
        # Create the transcripts table when none was provided
        if transcripts_table is None:
            log.debug(f"transcripts_table={transcripts_table}")
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            log.debug(f"transcripts_table={transcripts_table}")
            if transcripts_table is None:
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
            """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if not exists (required by prioritization)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
                """
            self.execute_query(query_add_info)

        # Prioritization param: force pzfields to only the PZ Score and Flag.
        # NOTE: this mutates the caller-visible param dict on purpose, since
        # self.prioritization() below reads the same sub-dict.
        pz_param = param.get("transcripts", {}).get("prioritization", {})
        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile is configured
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Explode PZ fields into their own columns for the ranking query
        self.explode_infos(
            table=transcripts_table,
            fields=param.get("transcripts", {})
            .get("prioritization", {})
            .get("pzfields", []),
        )

        # For each variant, keep the best transcript (rank 1: flag ASC, score
        # DESC, transcript ASC) and append its infos to the variants INFO.
        # NOTE(review): the WHERE clause hardcodes 'variants' rather than
        # {table_variants} — confirm both always name the same table.
        query_update = f"""
            WITH RankedTranscripts AS (
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
                    ) AS rn
                FROM
                    {transcripts_table}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"

            """
        self.execute_query(query=query_update)

        # Declare the selected-transcript INFO tag in the VCF header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Return
        return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and others.
Returns
The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
9742 def create_transcript_view_from_columns_map( 9743 self, 9744 transcripts_table: str = "transcripts", 9745 columns_maps: dict = {}, 9746 added_columns: list = [], 9747 temporary_tables: list = None, 9748 annotation_fields: list = None, 9749 ) -> tuple[list, list, list]: 9750 """ 9751 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9752 specified columns mapping for transcripts data. 9753 9754 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9755 the table where the transcripts data is stored or will be stored in the database. This table 9756 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9757 predictions, etc. It defaults to "transcripts, defaults to transcripts 9758 :type transcripts_table: str (optional) 9759 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9760 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9761 represents a mapping configuration for a specific set of columns. It typically includes details such 9762 as the main transcript column and additional information columns 9763 :type columns_maps: dict 9764 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9765 function is a list that stores the additional columns that will be added to the view being created 9766 based on the columns map provided. These columns are generated by exploding the transcript 9767 information columns along with the main transcript column 9768 :type added_columns: list 9769 :param temporary_tables: The `temporary_tables` parameter in the 9770 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9771 tables created during the process of creating a transcript view from a columns map. 
These temporary 9772 tables are used to store intermediate results or transformations before the final view is generated 9773 :type temporary_tables: list 9774 :param annotation_fields: The `annotation_fields` parameter in the 9775 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9776 for annotation in the query view creation process. These fields are extracted from the 9777 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9778 :type annotation_fields: list 9779 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9780 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9781 """ 9782 9783 log.debug("Start transcrpts view creation from columns map...") 9784 9785 # "from_columns_map": [ 9786 # { 9787 # "transcripts_column": "Ensembl_transcriptid", 9788 # "transcripts_infos_columns": [ 9789 # "genename", 9790 # "Ensembl_geneid", 9791 # "LIST_S2_score", 9792 # "LIST_S2_pred", 9793 # ], 9794 # }, 9795 # { 9796 # "transcripts_column": "Ensembl_transcriptid", 9797 # "transcripts_infos_columns": [ 9798 # "genename", 9799 # "VARITY_R_score", 9800 # "Aloft_pred", 9801 # ], 9802 # }, 9803 # ], 9804 9805 # Init 9806 if temporary_tables is None: 9807 temporary_tables = [] 9808 if annotation_fields is None: 9809 annotation_fields = [] 9810 9811 # Variants table 9812 table_variants = self.get_table_variants() 9813 9814 for columns_map in columns_maps: 9815 9816 # Transcript column 9817 transcripts_column = columns_map.get("transcripts_column", None) 9818 9819 # Transcripts infos columns 9820 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9821 9822 if transcripts_column is not None: 9823 9824 # Explode 9825 added_columns += self.explode_infos( 9826 fields=[transcripts_column] + transcripts_infos_columns 9827 ) 9828 9829 # View clauses 9830 clause_select = [] 9831 for field in [transcripts_column] + 
transcripts_infos_columns: 9832 clause_select.append( 9833 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9834 ) 9835 if field not in [transcripts_column]: 9836 annotation_fields.append(field) 9837 9838 # Querey View 9839 query = f""" 9840 SELECT 9841 "#CHROM", POS, REF, ALT, INFO, 9842 "{transcripts_column}" AS 'transcript', 9843 {", ".join(clause_select)} 9844 FROM ( 9845 SELECT 9846 "#CHROM", POS, REF, ALT, INFO, 9847 {", ".join(clause_select)} 9848 FROM {table_variants} 9849 ) 9850 WHERE "{transcripts_column}" IS NOT NULL 9851 """ 9852 9853 # Create temporary table 9854 temporary_table = transcripts_table + "".join( 9855 random.choices(string.ascii_uppercase + string.digits, k=10) 9856 ) 9857 9858 # Temporary_tables 9859 temporary_tables.append(temporary_table) 9860 query_view = f""" 9861 CREATE TEMPORARY TABLE {temporary_table} 9862 AS ({query}) 9863 """ 9864 self.execute_query(query=query_view) 9865 9866 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters

- `transcripts_table`: a string specifying the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, and predictions. Defaults to "transcripts".
- `columns_maps`: a list of mapping configurations describing how to map columns from a transcripts table to create a view. Each entry typically includes the main transcript column and additional information columns.
- `added_columns`: a list storing the additional columns added to the view being created, generated by exploding the transcript information columns along with the main transcript column.
- `temporary_tables`: a list storing the names of temporary tables created while building the transcript view; these tables hold intermediate results or transformations before the final view is generated.
- `annotation_fields`: a list of the fields used for annotation during query-view creation, extracted from the `transcripts_column` and `transcripts_infos_columns` of each mapping.
Returns

The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9868 def create_transcript_view_from_column_format( 9869 self, 9870 transcripts_table: str = "transcripts", 9871 column_formats: dict = {}, 9872 temporary_tables: list = None, 9873 annotation_fields: list = None, 9874 ) -> tuple[list, list, list]: 9875 """ 9876 The `create_transcript_view_from_column_format` function generates a transcript view based on 9877 specified column formats, adds additional columns and annotation fields, and returns the list of 9878 temporary tables and annotation fields. 9879 9880 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9881 the table containing the transcripts data. This table will be used as the base table for creating 9882 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9883 different table name if needed, defaults to transcripts 9884 :type transcripts_table: str (optional) 9885 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9886 about the columns to be used for creating the transcript view. Each entry in the dictionary 9887 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9888 the provided code snippet: 9889 :type column_formats: dict 9890 :param temporary_tables: The `temporary_tables` parameter in the 9891 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9892 views created during the process of creating a transcript view from a column format. These temporary 9893 views are used to manipulate and extract data before generating the final transcript view. It 9894 :type temporary_tables: list 9895 :param annotation_fields: The `annotation_fields` parameter in the 9896 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9897 that are extracted from the temporary views created during the process. 
These annotation fields are 9898 obtained by querying the temporary views and extracting the column names excluding specific columns 9899 like `#CH 9900 :type annotation_fields: list 9901 :return: The `create_transcript_view_from_column_format` function returns two lists: 9902 `temporary_tables` and `annotation_fields`. 9903 """ 9904 9905 log.debug("Start transcrpts view creation from column format...") 9906 9907 # "from_column_format": [ 9908 # { 9909 # "transcripts_column": "ANN", 9910 # "transcripts_infos_column": "Feature_ID", 9911 # } 9912 # ], 9913 9914 # Init 9915 if temporary_tables is None: 9916 temporary_tables = [] 9917 if annotation_fields is None: 9918 annotation_fields = [] 9919 9920 for column_format in column_formats: 9921 9922 # annotation field and transcript annotation field 9923 annotation_field = column_format.get("transcripts_column", "ANN") 9924 transcript_annotation = column_format.get( 9925 "transcripts_infos_column", "Feature_ID" 9926 ) 9927 9928 # Temporary View name 9929 temporary_view_name = transcripts_table + "".join( 9930 random.choices(string.ascii_uppercase + string.digits, k=10) 9931 ) 9932 9933 # Create temporary view name 9934 temporary_view_name = self.annotation_format_to_table( 9935 uniquify=True, 9936 annotation_field=annotation_field, 9937 view_name=temporary_view_name, 9938 annotation_id=transcript_annotation, 9939 ) 9940 9941 # Annotation fields 9942 if temporary_view_name: 9943 query_annotation_fields = f""" 9944 SELECT * 9945 FROM ( 9946 DESCRIBE SELECT * 9947 FROM {temporary_view_name} 9948 ) 9949 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9950 """ 9951 df_annotation_fields = self.get_query_to_df( 9952 query=query_annotation_fields 9953 ) 9954 9955 # Add temporary view and annotation fields 9956 temporary_tables.append(temporary_view_name) 9957 annotation_fields += list(set(df_annotation_fields["column_name"])) 9958 9959 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters

- `transcripts_table`: a string specifying the name of the table containing the transcripts data, used as the base table for creating the transcript view. Defaults to "transcripts".
- `column_formats`: the configurations of the columns used for creating the transcript view; each entry specifies the mapping between a transcripts column and a transcripts-infos column.
- `temporary_tables`: a list storing the names of the temporary views created during the process; these views are used to manipulate and extract data before generating the final transcript view.
- `annotation_fields`: a list storing the annotation fields extracted from the temporary views, obtained by querying each view and collecting its column names (excluding '#CHROM', 'POS', 'REF' and 'ALT').
Returns

The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
9961 def create_transcript_view( 9962 self, 9963 transcripts_table: str = None, 9964 transcripts_table_drop: bool = True, 9965 param: dict = {}, 9966 ) -> str: 9967 """ 9968 The `create_transcript_view` function generates a transcript view by processing data from a 9969 specified table based on provided parameters and structural information. 9970 9971 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9972 is used to specify the name of the table that will store the final transcript view data. If a table 9973 name is not provided, the function will create a new table to store the transcript view data, and by 9974 default,, defaults to transcripts 9975 :type transcripts_table: str (optional) 9976 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9977 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9978 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9979 the function will drop the existing transcripts table if it exists, defaults to True 9980 :type transcripts_table_drop: bool (optional) 9981 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9982 contains information needed to create a transcript view. It includes details such as the structure 9983 of the transcripts, columns mapping, column formats, and other necessary information for generating 9984 the view. This parameter allows for flexibility and customization 9985 :type param: dict 9986 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9987 created or modified during the execution of the function. 
9988 """ 9989 9990 log.debug("Start transcripts view creation...") 9991 9992 # Default 9993 transcripts_table_default = "transcripts" 9994 9995 # Param 9996 if not param: 9997 param = self.get_param() 9998 9999 # Struct 10000 struct = param.get("transcripts", {}).get("struct", None) 10001 10002 if struct: 10003 10004 # Transcripts table 10005 if transcripts_table is None: 10006 transcripts_table = param.get("transcripts", {}).get( 10007 "table", transcripts_table_default 10008 ) 10009 10010 # added_columns 10011 added_columns = [] 10012 10013 # Temporary tables 10014 temporary_tables = [] 10015 10016 # Annotation fields 10017 annotation_fields = [] 10018 10019 # from columns map 10020 columns_maps = struct.get("from_columns_map", []) 10021 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10022 self.create_transcript_view_from_columns_map( 10023 transcripts_table=transcripts_table, 10024 columns_maps=columns_maps, 10025 added_columns=added_columns, 10026 temporary_tables=temporary_tables, 10027 annotation_fields=annotation_fields, 10028 ) 10029 ) 10030 added_columns += added_columns_tmp 10031 temporary_tables += temporary_tables_tmp 10032 annotation_fields += annotation_fields_tmp 10033 10034 # from column format 10035 column_formats = struct.get("from_column_format", []) 10036 temporary_tables_tmp, annotation_fields_tmp = ( 10037 self.create_transcript_view_from_column_format( 10038 transcripts_table=transcripts_table, 10039 column_formats=column_formats, 10040 temporary_tables=temporary_tables, 10041 annotation_fields=annotation_fields, 10042 ) 10043 ) 10044 temporary_tables += temporary_tables_tmp 10045 annotation_fields += annotation_fields_tmp 10046 10047 # Merge temporary tables query 10048 query_merge = "" 10049 for temporary_table in temporary_tables: 10050 10051 # First temporary table 10052 if not query_merge: 10053 query_merge = f""" 10054 SELECT * FROM {temporary_table} 10055 """ 10056 # other temporary table (using UNION) 10057 else: 
10058 query_merge += f""" 10059 UNION BY NAME SELECT * FROM {temporary_table} 10060 """ 10061 10062 # Merge on transcript 10063 query_merge_on_transcripts_annotation_fields = [] 10064 # Aggregate all annotations fields 10065 for annotation_field in set(annotation_fields): 10066 query_merge_on_transcripts_annotation_fields.append( 10067 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 10068 ) 10069 # Query for transcripts view 10070 query_merge_on_transcripts = f""" 10071 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 10072 FROM ({query_merge}) 10073 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 10074 """ 10075 10076 # Drop transcript view is necessary 10077 if transcripts_table_drop: 10078 query_drop = f""" 10079 DROP TABLE IF EXISTS {transcripts_table}; 10080 """ 10081 self.execute_query(query=query_drop) 10082 10083 # Merge and create transcript view 10084 query_create_view = f""" 10085 CREATE TABLE IF NOT EXISTS {transcripts_table} 10086 AS {query_merge_on_transcripts} 10087 """ 10088 self.execute_query(query=query_create_view) 10089 10090 # Remove added columns 10091 for added_column in added_columns: 10092 self.drop_column(column=added_column) 10093 10094 else: 10095 10096 transcripts_table = None 10097 10098 return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters

- `transcripts_table`: the name of the table that will store the final transcript view data. If a table name is not provided, it is read from the parameters, defaulting to "transcripts".
- `transcripts_table_drop`: a boolean determining whether to drop the existing transcripts table before creating a new one. If set to `True`, the existing transcripts table is dropped if it exists. Defaults to True.
- `param`: a dictionary containing the information needed to create the transcript view, such as the structure of the transcripts, columns mapping, and column formats. This parameter allows for flexibility and customization.
Returns

The `create_transcript_view` function returns the name of the transcripts table that was created or modified during its execution, or None when no transcript structure is configured.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. 'ANN') into a temporary
        table with one row per annotation entry and one typed column per
        annotation sub-field.

        The sub-field names are parsed from the quoted section of the INFO field
        description in the VCF header (sub-fields separated by ' | '). Each
        sub-field column's SQL type is detected from its values, and a
        'transcript' column is added from the `annotation_id` sub-field.

        :param uniquify: Whether annotation entries are uniquified when the
            annotation string is exploded, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field holding the annotation,
            defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Sub-field used as the transcript identifier
            (non-alphanumeric characters are stripped), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :return: The name of the created table (`view_name`), or None when
            `annotation_field` is not present in the VCF header
        :raises ValueError: If the header description of the annotation field
            does not contain a quoted ' | '-separated sub-field list
        """

        # Name of the intermediate column holding the exploded annotation as JSON
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier (keep alphanumeric characters only)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty configured prefix is forced to "INFO/"
        # here — confirm this is intended rather than using the prefix as-is
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the annotation field and its exploded JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again before returning)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted part of the
            # header description (e.g. "... 'Allele | Annotation | ...'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Map sanitized name -> original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variants with the exploded annotation column as a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON structure keyed by the
            # header sub-field names (output_format="JSON")
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the exploded annotations
            # (DuckDB can query the pandas DataFrame by its variable name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key (alphanumeric characters only)
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract the key's values in order to detect its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty
                # strings or None with NaN and drop rows with NaN, so that type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction clause (empty strings become NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one column per annotation sub-field,
            # plus a 'transcript' column from the annotation identifier
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing to explode, return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The function annotation_format_to_table converts annotation data from a VCF file into a structured
table format.
Parameters

- `uniquify`: a boolean flag determining whether the output values are made unique. If set to `True`, the function ensures that the output values are unique. Defaults to True.
- `annotation_field`: the field in the VCF file containing the annotation information for each variant, used to extract the annotation details for further processing. Defaults to "ANN".
- `annotation_id`: the identifier of the annotation feature, used as a column name in the resulting table so each annotation entry can be uniquely identified. Defaults to "Feature_ID".
- `view_name`: the name of the temporary table created to store the transformed annotation data in a structured format for further processing or analysis. Defaults to "transcripts".
Returns

The `annotation_format_to_table` function returns the name of the view created, stored in the variable `view_name` (or None when the annotation field is absent from the header).
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Annotate the variants table with the content of the transcripts table,
        as a JSON column/INFO field and/or a structured (pipe-separated)
        column/INFO field.

        Transcript rows are grouped per variant ("#CHROM", POS, REF, ALT); the
        annotation columns of each transcript are aggregated either into a JSON
        object keyed by transcript id, or into a '|'-separated string, and
        written back to the variants table via UPDATE queries. The VCF header
        is extended with the corresponding INFO definitions.

        :param transcripts_table: Name of the transcripts table; read from
            param["transcripts"]["table"] when None, defaults to "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: Column of the transcripts table holding
            the transcript identifier; read from
            param["transcripts"]["column_id"] when None, defaults to
            "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: Name of the variants-table column that
            will receive the transcripts as JSON (column created if needed)
        :type transcripts_info_json: str
        :param transcripts_info_field_json: Name of the VCF INFO field that
            will receive the transcripts as JSON (appended to INFO)
        :type transcripts_info_field_json: str
        :param transcripts_info_format: Name of the variants-table column that
            will receive the transcripts in structured format
        :type transcripts_info_format: str
        :param transcripts_info_field_format: Name of the VCF INFO field that
            will receive the transcripts in structured format
        :type transcripts_info_field_format: str
        :param param: Parameters dictionary; when empty, the object's
            parameters are used
        :type param: dict
        :return: True when annotation was performed, False when none of the
            four output options is configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): if transcripts_info_field_json is set while
        # transcripts_info_json stays None, the JSON UPDATE below renders
        # "t.None" / "AS None" in SQL — the commented-out fallback suggests this
        # case was considered; confirm the two options are always set together
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # NOTE(review): same remark as above for the FORMAT pair of options
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param: nothing to do if no output option is set
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns (all columns except coordinates and the id)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build per-column clauses: split comma-joined values into rows, and
        # render each column for the JSON and pipe-separated outputs
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE queries
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" below looks like a typo for "unknown"
            # (source/version fields of the INFO definition)
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Add to update: append ';<field>=<json>' to INFO, treating
            # '' and '.' as empty on both sides
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: build one JSON object per variant, keyed by
            # transcript id, then join it back on the variant coordinates
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                        )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Add to update: append ';<field>=<value>' to INFO, treating
            # '' and '.' as empty on both sides
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one '|'-separated record per transcript, aggregated
            # per variant, then joined back on the variant coordinates
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                        string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
The `transcript_view_to_variants` function updates a variants table with information from
transcripts in JSON format.

Parameters
- transcripts_table: The `transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
- transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
- transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
- transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
- transcripts_info_format: The `transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
- transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
- param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The `param` dictionary can be passed as an argument

Returns
The function `transcript_view_to_variants` returns a boolean value. It returns `True` if the operation is successful and `False` if certain conditions are not met.